Spaces:

ChatbotNova
/

Chatbot-Backend

Sleeping

App Files Files Community

srilakshu012456 committed on Dec 24, 2025

Commit

125282c

verified ·

1 Parent(s): a226b7b

Update services/kb_creation.py

Browse files

Files changed (1) hide show

services/kb_creation.py +87 -35

services/kb_creation.py CHANGED Viewed

@@ -43,6 +43,8 @@ def _tokenize_meta_value(val: Optional[str]) -> List[str]:
     return _tokenize(val or "")
 # ---------------------------- DOCX parsing & chunking ----------------------------
 def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
     sections: List[Tuple[str, List[str]]] = []
     current_title = None
@@ -66,18 +68,57 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
         sections = [("Document", all_text)]
     return sections
-def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
-    body = "\n".join(paragraphs).strip()
-    if not body:
-        return []
-    words = body.split()
     chunks: List[str] = []
-    for i in range(0, len(words), max_words):
-        chunk_body = ' '.join(words[i:i + max_words]).strip()
-        if chunk_body:
-            chunks.append(chunk_body)
     if not chunks:
-        chunks = [body]
     return chunks
 # ---------------------------- Intent & Module tagging ----------------------------
@@ -89,7 +130,7 @@ PERMISSION_TERMS = [
     "role", "role access", "role mapping", "security", "security profile", "privilege", "insufficient",
     "not allowed", "not authorized", "denied", "restrict"
 ]
-ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
 STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
 MODULE_VOCAB = {
@@ -179,7 +220,7 @@ def ingest_documents(folder_path: str) -> None:
         total_chunks = 0
         for s_idx, (section_title, paras) in enumerate(sections):
-            chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
             total_chunks += len(chunks)
             base_intent = _infer_intent_tag(section_title)
@@ -214,6 +255,7 @@ def ingest_documents(folder_path: str) -> None:
                     except Exception as e2:
                         print(f"[KB] ERROR: Upsert failed for {doc_id}: {e2}")
                 tokens = _tokenize(chunk)
                 tf: Dict[str, int] = {}
                 for tkn in tokens:
@@ -353,7 +395,7 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
         "ids": ids,
     }
-# ---------------------------- Hybrid search (robust) ----------------------------
 ACTION_SYNONYMS = {
     "create": ["create", "creation", "add", "new", "generate"],
     "update": ["update", "modify", "change", "edit"],
@@ -364,7 +406,7 @@ ERROR_INTENT_TERMS = [
     "error", "issue", "fail", "not working", "resolution", "fix",
     "permission", "permissions", "access", "no access", "authorization", "authorisation",
     "role", "role mapping", "not authorized", "permission denied", "insufficient privileges",
-    "escalation", "escalation path", "access right"
 ]
 def _detect_user_intent(query: str) -> str:
@@ -445,12 +487,6 @@ def _intent_weight(meta: dict, user_intent: str) -> float:
         return 0.75
     return -0.2
-def _normalize_for_match(text: str) -> str:
-    t = (text or "").lower()
-    t = re.sub(r"[^\w\s]", " ", t)
-    t = re.sub(r"\s+", " ", t).strip()
-    return t
 def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
     fn_tokens = _tokenize_meta_value(meta.get("filename"))
     title_tokens = _tokenize_meta_value(meta.get("title"))
@@ -482,20 +518,31 @@ def _phrase_boost_score(text: str, q_terms: List[str]) -> float:
             score += 0.60
     return min(score, 1.5)
 def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
     norm_query = _normalize_query(query)
     q_terms = _tokenize(norm_query)
     user_intent = _detect_user_intent(query)
-    # Robust guards so missing helpers can’t crash
-    try:
-        actions = _extract_actions(query)
-    except Exception:
-        actions = []
-    try:
-        user_modules = _extract_modules_from_query(query)
-    except Exception:
-        user_modules = []
     sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
     sem_docs = sem_res.get("documents", [])
@@ -531,8 +578,9 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
     zeta = 0.65    # module weight
     eta = 0.45     # phrase-level boost
     theta = 0.40   # heading alignment bonus
-    combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]] = []
     for cid in union_ids:
         if cid in sem_ids:
             pos = sem_ids.index(cid)
@@ -555,6 +603,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
         act_wt = _action_weight(text, actions)
         mod_wt = _module_weight(meta, user_modules)
         phrase_wt = _phrase_boost_score(text, q_terms)
         sec_low = ((meta or {}).get("section", "") or "").lower()
         title_low = ((meta or {}).get("title", "") or "").lower()
@@ -575,20 +624,21 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
             + zeta * mod_wt
             + eta * phrase_wt
             + theta * heading_bonus
         )
         combined_records_ext.append(
-            (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt, phrase_wt, heading_bonus)
         )
     from collections import defaultdict
-    doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]]] = defaultdict(list)
     for rec in combined_records_ext:
         meta = rec[4] or {}
         fn = meta.get("filename", "unknown")
         doc_groups[fn].append(rec)
-    def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]]) -> float:
         total_score = sum(r[1] for r in recs)
         total_overlap = sum(r[5] for r in recs)
         total_intent = sum(max(0.0, r[6]) for r in recs)
@@ -596,6 +646,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
         total_module = sum(r[8] for r in recs)
         total_phrase = sum(r[9] for r in recs)
         total_heading = sum(r[10] for r in recs)
         total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
         esc_weight = 0.3 if any("escalation" in ((r[4] or {}).get("section", "")).lower() for r in recs) else 0.0
         perm_weight = 0.3 if any("permissions" in (((r[4] or {}).get("topic_tags") or [])) for r in recs) else 0.0
@@ -607,6 +658,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
             + 0.8 * total_module
             + 0.6 * total_phrase
             + 0.6 * total_heading
             + 0.3 * total_penalty
             + esc_weight + perm_weight
         )
@@ -618,7 +670,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
             best_doc_prior, best_doc = p, fn
     best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
-    other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]] = []
     for fn, recs in doc_groups.items():
         if fn == best_doc:
             continue

     return _tokenize(val or "")
 # ---------------------------- DOCX parsing & chunking ----------------------------
+BULLET_RE = re.compile(r"^\s*(?:[\-\*\u2022]|\d+[.)])\s+", re.IGNORECASE)
 def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
     sections: List[Tuple[str, List[str]]] = []
     current_title = None
         sections = [("Document", all_text)]
     return sections
+def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
+    """
+    Split paragraphs into bullet-aware lines:
+    - Preserve bullets and numbered list lines as separate atomic lines.
+    - Split every other paragraph into sentences at whitespace following '.', '!' or '?'.
+    """
+    lines: List[str] = []
+    for p in (paragraphs or []):
+        p = (p or "").strip()
+        if not p:
+            continue
+        # If looks like a bullet/numbered item, keep as is
+        if BULLET_RE.match(p):
+            lines.append(p)
+            continue
+        # Otherwise split by sentence boundaries
+        parts = [s.strip() for s in re.split(r"(?<=[.!?])\s+", p) if s.strip()]
+        lines.extend(parts)
+    return lines
+def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 300) -> List[str]:
+    """
+    Smaller chunks for better recall; bullet-aware.
+    """
+    lines = _paragraphs_to_lines(paragraphs)
     chunks: List[str] = []
+    current: List[str] = []
+    current_len = 0
+    for ln in lines:
+        w = ln.split()
+        if current_len + len(w) > max_words or (BULLET_RE.match(ln) and current):
+            # close current chunk
+            chunk = " ".join(current).strip()
+            if chunk:
+                chunks.append(chunk)
+            current = [ln]
+            current_len = len(w)
+        else:
+            current.append(ln)
+            current_len += len(w)
+    if current:
+        chunk = " ".join(current).strip()
+        if chunk:
+            chunks.append(chunk)
     if not chunks:
+        body = " ".join(lines).strip()
+        if body:
+            chunks = [body]
     return chunks
 # ---------------------------- Intent & Module tagging ----------------------------
     "role", "role access", "role mapping", "security", "security profile", "privilege", "insufficient",
     "not allowed", "not authorized", "denied", "restrict"
 ]
+ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't", "mismatch", "locked", "wrong", "denied"]
 STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
 MODULE_VOCAB = {
         total_chunks = 0
         for s_idx, (section_title, paras) in enumerate(sections):
+            chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=300)
             total_chunks += len(chunks)
             base_intent = _infer_intent_tag(section_title)
                     except Exception as e2:
                         print(f"[KB] ERROR: Upsert failed for {doc_id}: {e2}")
+                # Build BM25 index entries
                 tokens = _tokenize(chunk)
                 tf: Dict[str, int] = {}
                 for tkn in tokens:
         "ids": ids,
     }
+# ---------------------------- Hybrid search (improved) ----------------------------
 ACTION_SYNONYMS = {
     "create": ["create", "creation", "add", "new", "generate"],
     "update": ["update", "modify", "change", "edit"],
     "error", "issue", "fail", "not working", "resolution", "fix",
     "permission", "permissions", "access", "no access", "authorization", "authorisation",
     "role", "role mapping", "not authorized", "permission denied", "insufficient privileges",
+    "escalation", "escalation path", "access right", "mismatch", "locked", "wrong"
 ]
 def _detect_user_intent(query: str) -> str:
         return 0.75
     return -0.2
 def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
     fn_tokens = _tokenize_meta_value(meta.get("filename"))
     title_tokens = _tokenize_meta_value(meta.get("title"))
             score += 0.60
     return min(score, 1.5)
+def _literal_query_match_boost(text: str, query_norm: str) -> float:
+    """
+    Extra boost if the exact normalized query substring (or key tokens) appear in the chunk.
+    Helps errors like 'item mismatch' pick the right KB line.
+    """
+    t = (text or "").lower()
+    q = (query_norm or "").lower()
+    boost = 0.0
+    if q and q in t:
+        boost += 0.6
+    # Also check key 2-word error tokens present in query (e.g., 'item mismatch')
+    toks = [tok for tok in q.split() if len(tok) > 2]
+    bigrams = _make_ngrams(toks, 2)
+    for bg in bigrams:
+        if bg in t:
+            boost += 0.6
+            break
+    return min(boost, 1.2)
 def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
     norm_query = _normalize_query(query)
     q_terms = _tokenize(norm_query)
     user_intent = _detect_user_intent(query)
+    actions = _extract_actions(query)
+    user_modules = _extract_modules_from_query(query)
     sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
     sem_docs = sem_res.get("documents", [])
     zeta = 0.65    # module weight
     eta = 0.45     # phrase-level boost
     theta = 0.40   # heading alignment bonus
+    iota = 0.40    # literal query match boost
+    combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]] = []
     for cid in union_ids:
         if cid in sem_ids:
             pos = sem_ids.index(cid)
         act_wt = _action_weight(text, actions)
         mod_wt = _module_weight(meta, user_modules)
         phrase_wt = _phrase_boost_score(text, q_terms)
+        literal_wt = _literal_query_match_boost(text, norm_query)
         sec_low = ((meta or {}).get("section", "") or "").lower()
         title_low = ((meta or {}).get("title", "") or "").lower()
             + zeta * mod_wt
             + eta * phrase_wt
             + theta * heading_bonus
+            + iota * literal_wt
         )
         combined_records_ext.append(
+            (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt, phrase_wt, heading_bonus, literal_wt)
         )
     from collections import defaultdict
+    doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]] = defaultdict(list)
     for rec in combined_records_ext:
         meta = rec[4] or {}
         fn = meta.get("filename", "unknown")
         doc_groups[fn].append(rec)
+    def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]) -> float:
         total_score = sum(r[1] for r in recs)
         total_overlap = sum(r[5] for r in recs)
         total_intent = sum(max(0.0, r[6]) for r in recs)
         total_module = sum(r[8] for r in recs)
         total_phrase = sum(r[9] for r in recs)
         total_heading = sum(r[10] for r in recs)
+        total_literal = sum(r[11] for r in recs)
         total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
         esc_weight = 0.3 if any("escalation" in ((r[4] or {}).get("section", "")).lower() for r in recs) else 0.0
         perm_weight = 0.3 if any("permissions" in (((r[4] or {}).get("topic_tags") or [])) for r in recs) else 0.0
             + 0.8 * total_module
             + 0.6 * total_phrase
             + 0.6 * total_heading
+            + 0.6 * total_literal
             + 0.3 * total_penalty
             + esc_weight + perm_weight
         )
             best_doc_prior, best_doc = p, fn
     best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
+    other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]] = []
     for fn, recs in doc_groups.items():
         if fn == best_doc:
             continue