srilakshu012456 committed on
Commit
0ffd0e8
·
verified ·
1 Parent(s): 14a4368

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +312 -8
services/kb_creation.py CHANGED
@@ -365,14 +365,318 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
365
  "ids": ids,
366
  }
367
 
368
- # ------------------------------ Hybrid search (improved + exact-match rerank) ------------------------------
369
- # (unchanged from your version; omitted for brevity here)
370
- # NOTE: Keep your existing 'hybrid_search_knowledge_base' implementation as-is.
371
- # It already returns best_doc, user_intent, etc.
372
- from collections import defaultdict
373
-
374
- # (Paste your existing hybrid_search_knowledge_base implementation here unchanged.)
375
- # ── For brevity in this reply we keep your original code intact. ──
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
 
377
  # ------------------------------ Section fetch helpers ------------------------------
378
  def get_section_text(filename: str, section: str) -> str:
 
365
  "ids": ids,
366
  }
367
 
368
# ------------------------------ Hybrid search (generic + intent-aware) ------------------------------
# Synonym sets used to recognise which CRUD/navigation action the user's query asks for.
ACTION_SYNONYMS = {
    "create": ["create", "creation", "add", "new", "generate"],
    "update": ["update", "modify", "change", "edit"],
    "delete": ["delete", "remove"],
    "navigate": ["navigate", "go to", "open"],
}

# Keywords signalling that the user is reporting an error / permission / access problem.
ERROR_INTENT_TERMS = [
    "error", "issue", "fail", "not working", "resolution", "fix",
    "permission", "permissions", "access", "no access", "authorization", "authorisation",
    "role", "role mapping", "not authorized", "permission denied", "insufficient privileges",
    "escalation", "escalation path", "access right", "mismatch", "locked", "wrong",
]
381
+
382
def _detect_user_intent(query: str) -> str:
    """Classify the user's query intent.

    Priority order matters: error/permission language wins over procedural
    language, which wins over prerequisite and purpose questions.

    Args:
        query: raw user query (may be None/empty).

    Returns:
        One of "errors", "steps", "prereqs", "purpose", or "neutral".
    """
    q = (query or "").lower()
    tokens = set(q.split())
    if any(k in q for k in ERROR_INTENT_TERMS):
        return "errors"
    # "do"/"perform" are checked as whole words: a plain substring test would
    # misfire on e.g. "document", "download", or "performance".
    if any(k in q for k in ["steps", "procedure", "how to", "navigate", "process"]) \
            or tokens & {"do", "perform"}:
        return "steps"
    # "prerequisite" covers the singular form (previously only the hyphenated
    # and plural spellings matched); "requirement" also matches "requirements".
    if any(k in q for k in ["pre-requisite", "prerequisite", "requirement"]):
        return "prereqs"
    if any(k in q for k in ["purpose", "overview", "introduction"]):
        return "purpose"
    return "neutral"
393
+
394
def _extract_actions(query: str) -> List[str]:
    """Return the sorted, de-duplicated action names whose synonyms appear in *query*.

    Note: substring matching is intentional (so "creation" hits "create"),
    which also means e.g. "address" matches "add".
    """
    q = (query or "").lower()
    # sorted() always returns a list, so the original trailing "or []" was redundant.
    found = {act for act, syns in ACTION_SYNONYMS.items() if any(s in q for s in syns)}
    return sorted(found)
401
+
402
def _extract_modules_from_query(query: str) -> List[str]:
    """Return the sorted module names from MODULE_VOCAB whose synonyms occur in *query*."""
    q = (query or "").lower()
    matched = set()
    for module_name, synonyms in MODULE_VOCAB.items():
        for syn in synonyms:
            if syn in q:
                matched.add(module_name)
                break
    return sorted(matched)
409
+
410
def _action_weight(text: str, actions: List[str]) -> float:
    """Score *text* for agreement with the requested *actions*.

    +1.0 per action synonym found in the text; -0.8 per synonym of a
    conflicting action (e.g. "delete" wording when the user asked to
    "create"). Returns 0.0 when no actions were requested.
    """
    if not actions:
        return 0.0
    haystack = (text or "").lower()
    score = 0.0
    for act in actions:
        score += sum(1.0 for syn in ACTION_SYNONYMS.get(act, [act]) if syn in haystack)
    # Penalise text that talks about the opposite operation.
    conflicts = {"create": ["delete"], "delete": ["create"], "update": ["delete"], "navigate": []}
    for act in actions:
        for opposing in conflicts.get(act, []):
            score -= sum(0.8 for syn in ACTION_SYNONYMS.get(opposing, [opposing]) if syn in haystack)
    return score
426
+
427
+ def _module_weight(meta: Dict[str, Any], user_modules: List[str]) -> float:
428
+ if not user_modules:
429
+ return 0.0
430
+ raw = (meta or {}).get("module_tags", "") or ""
431
+ doc_modules = [m.strip() for m in raw.split(",") if m.strip()] if isinstance(raw, str) else (raw or [])
432
+ overlap = len(set(user_modules) & set(doc_modules))
433
+ if overlap == 0:
434
+ return -0.8
435
+ return 0.7 * overlap
436
+
437
+ def _intent_weight(meta: dict, user_intent: str) -> float:
438
+ tag = (meta or {}).get("intent_tag", "neutral")
439
+ if user_intent == "neutral":
440
+ return 0.0
441
+ if tag == user_intent:
442
+ return 1.0
443
+ if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]:
444
+ return -0.6
445
+ st = ((meta or {}).get("section", "") or "").lower()
446
+ topics = (meta or {}).get("topic_tags", "") or ""
447
+ topic_list = [t.strip() for t in topics.split(",") if t.strip()]
448
+ if user_intent == "errors" and (
449
+ any(k in st for k in ["common errors", "known issues", "common issues", "errors", "escalation", "permissions", "access"])
450
+ or ("permissions" in topic_list)
451
+ ):
452
+ return 1.10
453
+ if user_intent == "steps" and any(k in st for k in ["process steps", "procedure", "instructions", "workflow"]):
454
+ return 0.75
455
+ return -0.2
456
+
457
def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
    """Fraction of the query terms that also appear among the chunk's metadata tokens.

    Metadata tokens are drawn from filename, title, section, topic_tags and
    module_tags. Returns 0.0 when either side is empty.
    """
    token_sources = (
        meta.get("filename"),
        meta.get("title"),
        meta.get("section"),
        meta.get("topic_tags") or "",
        meta.get("module_tags") or "",
    )
    meta_tokens = set()
    for value in token_sources:
        meta_tokens.update(_tokenize_meta_value(value))
    if not meta_tokens or not q_terms:
        return 0.0
    query_set = set(q_terms)
    return len(meta_tokens & query_set) / max(1, len(query_set))
469
+
470
+ def _make_ngrams(tokens: List[str], n: int) -> List[str]:
471
+ return [" ".join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
472
+
473
def _phrase_boost_score(text: str, q_terms: List[str]) -> float:
    """Boost for multi-word query phrases found verbatim in *text*.

    +0.40 per matching bigram, +0.70 per matching trigram, capped at 2.0.
    """
    if not text or not q_terms:
        return 0.0
    haystack = (text or "").lower()
    score = 0.0
    for phrase in _make_ngrams(q_terms, 2):
        if phrase and phrase in haystack:
            score += 0.40
    for phrase in _make_ngrams(q_terms, 3):
        if phrase and phrase in haystack:
            score += 0.70
    return min(score, 2.0)
487
+
488
def _literal_query_match_boost(text: str, query_norm: str) -> float:
    """Boost when the normalized query, or one of its bigrams, appears verbatim in *text*.

    +0.8 for a whole-query match, +0.8 for the first matching bigram of
    tokens longer than two characters; capped at 1.6.
    """
    haystack = (text or "").lower()
    needle = (query_norm or "").lower()
    boost = 0.0
    if needle and needle in haystack:
        boost += 0.8
    long_tokens = [tok for tok in needle.split() if len(tok) > 2]
    # A single bigram hit is enough; do not stack multiple bigram boosts.
    for bigram in _make_ngrams(long_tokens, 2):
        if bigram in haystack:
            boost += 0.8
            break
    return min(boost, 1.6)
501
+
502
def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
    """
    Hybrid retrieval (embeddings + BM25) with intent-, action-, module-, and
    phrase-aware reranking.

    Args:
        query: raw user query.
        top_k: number of chunks to return.
        alpha: weight of the semantic (embedding) similarity.
        beta: weight of the normalized BM25 score.

    Returns:
        dict with "documents", "metadatas", "distances", "ids",
        "combined_scores", plus the doc-level winner ("best_doc",
        "best_doc_prior"), the detected "user_intent", and the extracted
        "actions" for downstream formatting.
    """
    from collections import defaultdict

    norm_query = _normalize_query(query)
    q_terms = _tokenize(norm_query)
    user_intent = _detect_user_intent(query)
    actions = _extract_actions(query)
    user_modules = _extract_modules_from_query(query)

    # --- semantic (embeddings) search via Chroma ---
    sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 40))
    sem_docs = sem_res.get("documents", [])
    sem_metas = sem_res.get("metadatas", [])
    sem_dists = sem_res.get("distances", [])
    sem_ids = sem_res.get("ids", [])

    def dist_to_sim(d: Optional[float]) -> float:
        # Map a distance to (0, 1]: smaller distance -> higher similarity.
        if d is None:
            return 0.0
        try:
            return 1.0 / (1.0 + float(d))
        except Exception:
            return 0.0

    sem_sims = [dist_to_sim(d) for d in sem_dists]
    # O(1) id -> position lookup; the previous sem_ids.index(cid) inside the
    # candidate loop was an accidental O(n^2).
    sem_pos = {cid: i for i, cid in enumerate(sem_ids)}

    # --- BM25 search ---
    bm25_hits = bm25_search(norm_query, top_k=max(80, top_k * 6))
    bm25_max = max([s for _, s in bm25_hits], default=1.0)
    bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
    bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
    for idx, nscore in bm25_norm_pairs:
        d = bm25_docs[idx]
        bm25_id_to_norm[d["id"]] = nscore
        bm25_id_to_text[d["id"]] = d["text"]
        bm25_id_to_meta[d["id"]] = d["meta"]

    # union of candidate IDs (semantic + bm25)
    union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())

    # reranking weights (alpha/beta come from the caller)
    gamma = 0.30    # meta overlap
    delta = 0.55    # intent boost
    epsilon = 0.30  # action weight
    zeta = 0.65    # module weight
    eta = 0.50     # phrase-level boost
    theta = 0.00   # optional heading alignment bonus (unused)
    iota = 0.60    # literal query match boost

    # record layout: (id, final_score, distance, text, meta,
    #                 m_overlap, intent, action, module, phrase, heading, literal)
    combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]] = []

    for cid in union_ids:
        # prefer the semantic hit's fields; fall back to the BM25 copy
        if cid in sem_pos:
            pos = sem_pos[cid]
            sem_sim = sem_sims[pos] if pos < len(sem_sims) else 0.0
            sem_dist = sem_dists[pos] if pos < len(sem_dists) else None
            sem_text = sem_docs[pos] if pos < len(sem_docs) else ""
            sem_meta = sem_metas[pos] if pos < len(sem_metas) else {}
        else:
            sem_sim, sem_dist, sem_text, sem_meta = 0.0, None, "", {}

        bm25_sim = bm25_id_to_norm.get(cid, 0.0)
        text = sem_text if sem_text else bm25_id_to_text.get(cid, "")
        meta = sem_meta if sem_meta else bm25_id_to_meta.get(cid, {})

        m_overlap = _meta_overlap(meta, q_terms)
        intent_boost = _intent_weight(meta, user_intent)
        act_wt = _action_weight(text, actions)
        mod_wt = _module_weight(meta, user_modules)
        phrase_wt = _phrase_boost_score(text, q_terms)
        literal_wt = _literal_query_match_boost(text, norm_query)

        final_score = (
            alpha * sem_sim
            + beta * bm25_sim
            + gamma * m_overlap
            + delta * intent_boost
            + epsilon * act_wt
            + zeta * mod_wt
            + eta * phrase_wt
            + theta * 0.0
            + iota * literal_wt
        )
        combined_records_ext.append(
            (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta,
             m_overlap, intent_boost, act_wt, mod_wt, phrase_wt, 0.0, literal_wt)
        )

    # exact-match rerank for errors: push chunks containing the query phrase(s) first
    if user_intent == "errors":
        toks = [tok for tok in norm_query.split() if len(tok) > 2]
        bigrams = _make_ngrams(toks, 2)
        exact_hits = []
        for rec in combined_records_ext:
            text_lower = (rec[3] or "").lower()
            if (norm_query and norm_query in text_lower) or any(bg in text_lower for bg in bigrams):
                exact_hits.append(rec)
        if exact_hits:
            # Identity set instead of "rec not in exact_hits": avoids O(n^2)
            # list scans with deep tuple/dict comparisons.
            hit_ids = {id(rec) for rec in exact_hits}
            rest = [rec for rec in combined_records_ext if id(rec) not in hit_ids]
            exact_hits.sort(key=lambda x: x[1], reverse=True)
            rest.sort(key=lambda x: x[1], reverse=True)
            combined_records_ext = exact_hits + rest

    # doc-level prior: prefer documents whose chunks collectively align with the query
    doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]] = defaultdict(list)
    for rec in combined_records_ext:
        fn = (rec[4] or {}).get("filename", "unknown")
        doc_groups[fn].append(rec)

    def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]) -> float:
        # Aggregate chunk-level evidence; negative intent/action contributions
        # are folded back in (damped) as total_penalty.
        total_score = sum(r[1] for r in recs)
        total_overlap = sum(r[5] for r in recs)
        total_intent = sum(max(0.0, r[6]) for r in recs)
        total_action = sum(max(0.0, r[7]) for r in recs)
        total_module = sum(r[8] for r in recs)
        total_phrase = sum(r[9] for r in recs)
        total_literal = sum(r[11] for r in recs)
        total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
        errors_section_bonus = 0.0
        error_markers = ("error", "known issues", "common issues")
        if any(any(m in ((r[4] or {}).get("section", "")).lower() for m in error_markers) for r in recs):
            errors_section_bonus = 0.5
        return (
            total_score
            + 0.4 * total_overlap
            + 0.7 * total_intent
            + 0.5 * total_action
            + 0.8 * total_module
            + 0.6 * total_phrase
            + 0.7 * total_literal
            + errors_section_bonus
            + 0.3 * total_penalty
        )

    best_doc, best_doc_prior = None, -1.0
    for fn, recs in doc_groups.items():
        p = doc_prior(recs)
        if p > best_doc_prior:
            best_doc_prior, best_doc = p, fn

    # Winning document's chunks first, then everything else, each sorted by score.
    best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
    other_recs = [rec for fn, recs in doc_groups.items() if fn != best_doc for rec in recs]
    other_recs.sort(key=lambda x: x[1], reverse=True)

    top = (best_recs + other_recs)[:top_k]
    return {
        "documents": [t[3] for t in top],
        "metadatas": [t[4] for t in top],
        "distances": [t[2] for t in top],
        "ids": [t[0] for t in top],
        "combined_scores": [t[1] for t in top],
        "best_doc": best_doc,
        "best_doc_prior": best_doc_prior,
        "user_intent": user_intent,
        "actions": actions,
    }
680
 
681
  # ------------------------------ Section fetch helpers ------------------------------
682
  def get_section_text(filename: str, section: str) -> str: