Spaces:

ChatbotNova
/

Chatbot-Backend

Sleeping

App Files Files Community

srilakshu012456 committed on Dec 23, 2025

Commit

6f19669

verified ·

1 Parent(s): 7815846

Update services/kb_creation.py

Browse files

Files changed (1) hide show

services/kb_creation.py +183 -111

services/kb_creation.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import os
 import re
 import pickle
@@ -13,10 +12,10 @@ CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
 client = chromadb.PersistentClient(path=CHROMA_PATH)
 collection = client.get_or_create_collection(name="knowledge_base")
-# --------------------------- Embedding model ---------------------------
 model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-# --------------------------- BM25 (lightweight) ---------------------------
 BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
 bm25_docs: List[Dict[str, Any]] = []
 bm25_inverted: Dict[str, List[int]] = {}
@@ -26,16 +25,18 @@ bm25_ready: bool = False
 BM25_K1 = 1.5
 BM25_B = 0.75
-# --------------------------- Utilities ---------------------------
 def _tokenize(text: str) -> List[str]:
     if not text:
         return []
     text = text.lower()
     return re.findall(r"[a-z0-9]+", text)
 def _normalize_query(q: str) -> str:
     q = (q or "").strip().lower()
     q = re.sub(r"[^\w\s]", " ", q)
     q = re.sub(
         r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b",
         " ",
@@ -44,43 +45,12 @@ def _normalize_query(q: str) -> str:
     q = re.sub(r"\s+", " ", q).strip()
     return q
 def _tokenize_meta_value(val: Optional[str]) -> List[str]:
     return _tokenize(val or "")
-# --------------------------- Semantic intent prototypes ---------------------------
-INTENT_PROTOTYPES: Dict[str, str] = {
-    "steps":       "Step-by-step procedure with actions the user must perform",
-    "navigation":  "Menu paths and locations in WMS, for example Navigate to Inbound > Receiving",
-    "errors":      "Common errors and resolution tips or troubleshooting guidance",
-    "prereqs":     "Pre-requisites, authorization, requirements before executing steps",
-    "purpose":     "Purpose, overview, introduction that explains why something is done",
-    "escalation":  "Escalation path or who to contact if the issue cannot be resolved",
-    "permission":  "User lacks authorization or access denied and needs role access check",
-}
-# Precompute prototype embeddings once
-PROTO_EMBS: Dict[str, List[float]] = {label: model.encode(text).tolist() for label, text in INTENT_PROTOTYPES.items()}
-def _embed(txt: str) -> List[float]:
-    return model.encode((txt or "").strip()).tolist()
-def _cos_sim(a: List[float], b: List[float]) -> float:
-    # pure-python cosine similarity
-    dot = sum(x * y for x, y in zip(a, b))
-    na = math.sqrt(sum(x * x for x in a)) + 1e-9
-    nb = math.sqrt(sum(y * y for y in b)) + 1e-9
-    return float(dot / (na * nb))
-def detect_user_intent(query: str) -> Tuple[str, float]:
-    q_vec = _embed(query or "")
-    best, best_s = "neutral", 0.0
-    for label, proto_vec in PROTO_EMBS.items():
-        s = _cos_sim(q_vec, proto_vec)
-        if s > best_s:
-            best, best_s = label, s
-    return best, best_s  # (intent label, confidence approx 0..1)
-# --------------------------- DOCX parsing & chunking ---------------------------
 def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
     sections: List[Tuple[str, List[str]]] = []
     current_title = None
@@ -104,8 +74,8 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
         sections = [("Document", all_text)]
     return sections
 def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
-    # Store only body text (no titles/headers in chunk) so users never see SOP headers
     body = "\n".join(paragraphs).strip()
     if not body:
         return []
@@ -119,7 +89,22 @@ def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: Lis
         chunks = [body]
     return chunks
-# --------------------------- Ingestion ---------------------------
 def ingest_documents(folder_path: str) -> None:
     print(f"📂 Checking folder: {folder_path}")
     files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
@@ -138,20 +123,10 @@ def ingest_documents(folder_path: str) -> None:
         doc = Document(file_path)
         sections = _split_by_sections(doc)
         total_chunks = 0
         for s_idx, (section_title, paras) in enumerate(sections):
             chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
             total_chunks += len(chunks)
-            # --- Semantic section intent tagging (no keywords to maintain) ---
-            section_text_for_tag = (section_title or "") + "\n" + ("\n".join(paras[:6]) if paras else "")
-            sec_vec = _embed(section_text_for_tag)
-            best_intent, best_score = "neutral", 0.0
-            for label, proto_vec in PROTO_EMBS.items():
-                s = _cos_sim(sec_vec, proto_vec)
-                if s > best_score:
-                    best_intent, best_score = label, s
             for c_idx, chunk in enumerate(chunks):
                 embedding = model.encode(chunk).tolist()
                 doc_id = f"{file}:{s_idx}:{c_idx}"
@@ -161,8 +136,7 @@ def ingest_documents(folder_path: str) -> None:
                     "chunk_index": c_idx,
                     "title": doc_title,
                     "collection": "SOP",
-                    "intent_tag": best_intent,
-                    "intent_score": best_score,
                 }
                 try:
                     collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
@@ -173,28 +147,24 @@ def ingest_documents(folder_path: str) -> None:
                     except Exception as e2:
                         print(f"❌ Upsert failed for {doc_id}: {e2}")
-                # BM25 indexing
                 tokens = _tokenize(chunk)
                 tf: Dict[str, int] = {}
                 for t in tokens:
                     tf[t] = tf.get(t, 0) + 1
                 idx = len(bm25_docs)
                 bm25_docs.append({"id": doc_id, "text": chunk, "tokens": tokens, "tf": tf, "length": len(tokens), "meta": meta})
                 seen = set()
                 for term in tf.keys():
                     bm25_inverted.setdefault(term, []).append(idx)
                     if term not in seen:
                         bm25_df[term] = bm25_df.get(term, 0) + 1
                         seen.add(term)
         print(f"📄 Ingested {file} → {total_chunks} chunks")
     N = len(bm25_docs)
     if N > 0:
         bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
         bm25_ready = True
     payload = {
         "bm25_docs": bm25_docs,
         "bm25_inverted": bm25_inverted,
@@ -209,6 +179,7 @@ def ingest_documents(folder_path: str) -> None:
     print(f"✅ BM25 index saved: {BM25_INDEX_FILE}")
     print(f"✅ Documents ingested. Total entries in Chroma: {collection.count()}")
 def _load_bm25_index() -> None:
     global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
     if not os.path.exists(BM25_INDEX_FILE):
@@ -226,9 +197,11 @@ def _load_bm25_index() -> None:
     except Exception as e:
         print(f"⚠️ Could not load BM25 index: {e}")
 _load_bm25_index()
-# --------------------------- BM25 search ---------------------------
 def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
     if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
         return 0.0
@@ -252,6 +225,7 @@ def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
         score += idf * ((tf * (BM25_K1 + 1)) / (denom or 1.0))
     return score
 def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
     if not bm25_ready:
         return []
@@ -273,35 +247,19 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
     scored.sort(key=lambda x: x[1], reverse=True)
     return scored[:top_k]
-# --------------------------- Semantic-only (Chroma) ---------------------------
 def search_knowledge_base(query: str, top_k: int = 10) -> dict:
     query_embedding = model.encode(query).tolist()
     res = collection.query(
         query_embeddings=[query_embedding],
         n_results=top_k,
-        include=['documents', 'metadatas', 'distances']
     )
-    docs_ll = res.get("documents", [[]]) or [[]]
-    metas_ll = res.get("metadatas", [[]]) or [[]]
-    dists_ll = res.get("distances", [[]]) or [[]]
-    ids_ll = res.get("ids", [[]]) or [[]]
-    documents = docs_ll[0] if docs_ll else []
-    metadatas = metas_ll[0] if metas_ll else []
-    distances = dists_ll[0] if dists_ll else []
-    ids = ids_ll[0] if ids_ll else []
-    if not ids and documents:
-        synthesized = []
-        for i, m in enumerate(metadatas):
-            fn = (m or {}).get("filename", "unknown")
-            sec = (m or {}).get("section", "section")
-            idx = (m or {}).get("chunk_index", i)
-            synthesized.append(f"{fn}:{sec}:{idx}")
-        ids = synthesized
-    print(f"🔎 KB search → {len(documents)} docs (top_k={top_k}); "
-          f"first distance: {distances[0] if distances else 'n/a'}; ids={len(ids)}")
     return {
         "documents": documents,
         "metadatas": metadatas,
@@ -309,7 +267,77 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
         "ids": ids,
     }
-# --------------------------- Hybrid (BM25 + Embeddings + Semantic Intent) ---------------------------
 def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
     fn_tokens = _tokenize_meta_value(meta.get("filename"))
     title_tokens = _tokenize_meta_value(meta.get("title"))
@@ -321,11 +349,48 @@ def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
     inter = len(meta_tokens & qset)
     return inter / max(1, len(qset))
 def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
     norm_query = _normalize_query(query)
     q_terms = _tokenize(norm_query)
-    user_intent, intent_conf = detect_user_intent(query)  # semantic
     sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
     sem_docs = sem_res.get("documents", [])
     sem_metas = sem_res.get("metadatas", [])
@@ -342,10 +407,10 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
     sem_sims = [dist_to_sim(d) for d in sem_dists]
     bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
     bm25_max = max([s for _, s in bm25_hits], default=1.0)
     bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
     bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
     for idx, nscore in bm25_norm_pairs:
         d = bm25_docs[idx]
@@ -355,8 +420,13 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
     union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
-    gamma = 0.25  # metadata overlap weight
-    combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float]] = []  # id, score, dist, text, meta, overlap, intentBoost
     for cid in union_ids:
         if cid in sem_ids:
@@ -375,38 +445,38 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
         text = sem_text if sem_text else bm25_text
         meta = sem_meta if sem_meta else bm25_meta
-        m_overlap = _meta_overlap(meta, q_terms)
-        tag = (meta or {}).get("intent_tag", "neutral")
-        tag_conf = float((meta or {}).get("intent_score", 0.0))
-        # Semantic intent boost (no keyword list)
-        intent_boost = 0.0
-        if user_intent != "neutral":
-            if tag == user_intent:
-                intent_boost = 0.7 * (0.5 + 0.5 * tag_conf)  # stronger if section is confidently tagged
-            elif tag_conf > 0.4:
-                intent_boost = -0.3 * tag_conf             # soft penalty if clearly different and confident
-        final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + intent_boost
         combined_records_ext.append(
-            (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost)
         )
-    # ---------------- Document-level voting prior ----------------
     from collections import defaultdict
-    doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float]]] = defaultdict(list)
     for rec in combined_records_ext:
         meta = rec[4] or {}
         fn = meta.get("filename", "unknown")
         doc_groups[fn].append(rec)
-    def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float]]) -> float:
         total_score = sum(r[1] for r in recs)
         total_overlap = sum(r[5] for r in recs)
-        total_intent = sum(max(0.0, r[6]) for r in recs)  # positive boosts
-        total_penalty = sum(min(0.0, r[6]) for r in recs) # penalties
-        return total_score + 0.4 * total_overlap + 0.6 * total_intent + 0.3 * total_penalty
     best_doc, best_doc_prior = None, -1.0
     for fn, recs in doc_groups.items():
@@ -421,10 +491,8 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
             continue
         other_recs.extend(recs)
     other_recs.sort(key=lambda x: x[1], reverse=True)
     reordered = best_recs + other_recs
     top = reordered[:top_k]
     documents = [t[3] for t in top]
     metadatas = [t[4] for t in top]
     distances = [t[2] for t in top]
@@ -440,10 +508,11 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
         "best_doc": best_doc,
         "best_doc_prior": best_doc_prior,
         "user_intent": user_intent,
-        "user_intent_conf": intent_conf,
     }
-# --------------------------- Section fetch helpers ---------------------------
 def get_section_text(filename: str, section: str) -> str:
     """Concatenate all chunk texts for a given filename+section."""
     texts: List[str] = []
@@ -455,6 +524,7 @@ def get_section_text(filename: str, section: str) -> str:
                 texts.append(t)
     return "\n\n".join(texts).strip()
 def get_best_steps_section_text(filename: str) -> str:
     """Return combined text of all 'steps' sections in the given SOP (filename)."""
     texts: List[str] = []
@@ -466,7 +536,8 @@ def get_best_steps_section_text(filename: str) -> str:
                 texts.append(t)
     return "\n\n".join(texts).strip()
-# --------------------------- Admin helpers ---------------------------
 def get_kb_runtime_info() -> Dict[str, Any]:
     return {
         "chroma_path": CHROMA_PATH,
@@ -477,6 +548,7 @@ def get_kb_runtime_info() -> Dict[str, Any]:
         "bm25_ready": bm25_ready,
     }
 def reset_kb(folder_path: str) -> Dict[str, Any]:
     result = {"status": "OK", "message": "KB reset and re-ingested"}
     try:

 import os
 import re
 import pickle
 client = chromadb.PersistentClient(path=CHROMA_PATH)
 collection = client.get_or_create_collection(name="knowledge_base")
+# --------------------------- Embedding model --------------------------
 model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+# --------------------------- BM25 (lightweight) -----------------------
 BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
 bm25_docs: List[Dict[str, Any]] = []
 bm25_inverted: Dict[str, List[int]] = {}
 BM25_K1 = 1.5
 BM25_B = 0.75
+# --------------------------- Utilities --------------------------------
 def _tokenize(text: str) -> List[str]:
     """Split *text* into lowercase ASCII alphanumeric tokens.
     After lowercasing, every run of characters in a-z / 0-9 becomes one
     token and everything else acts as a separator. Falsy input (empty
     string or None) yields an empty list.
     """
     return re.findall(r"[a-z0-9]+", text.lower()) if text else []
 def _normalize_query(q: str) -> str:
     q = (q or "").strip().lower()
     q = re.sub(r"[^\w\s]", " ", q)
+    # remove filler issue words
     q = re.sub(
         r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b",
         " ",
     q = re.sub(r"\s+", " ", q).strip()
     return q
 def _tokenize_meta_value(val: Optional[str]) -> List[str]:
     """Tokenize one metadata value with `_tokenize`; a falsy value is treated as empty text."""
     field_text = val if val else ""
     return _tokenize(field_text)
+# ---------------------- DOCX parsing & chunking -----------------------
 def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
     sections: List[Tuple[str, List[str]]] = []
     current_title = None
         sections = [("Document", all_text)]
     return sections
 def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
     body = "\n".join(paragraphs).strip()
     if not body:
         return []
         chunks = [body]
     return chunks
+# ---------------------- Intent tagging (section-based) ----------------
+def _infer_intent_tag(section_title: str) -> str:
+    """Map a section heading to a coarse intent tag by keyword lookup.
+    The lowercased title is tested against each keyword group in a fixed
+    order (steps, errors, prereqs, purpose); the first group with a keyword
+    occurring as a substring wins. Returns "neutral" when nothing matches
+    or the title is empty/None.
+    """
+    keyword_groups = (
+        ("steps", ("process steps", "procedure", "how to", "workflow", "instructions")),
+        ("errors", ("common errors", "resolution", "troubleshooting")),
+        ("prereqs", ("pre-requisites", "prerequisites")),
+        ("purpose", ("purpose", "overview", "introduction")),
+    )
+    title = (section_title or "").lower()
+    for tag, keywords in keyword_groups:
+        for keyword in keywords:
+            if keyword in title:
+                return tag
+    return "neutral"
+# ---------------------- Ingestion ------------------------------------
 def ingest_documents(folder_path: str) -> None:
     print(f"📂 Checking folder: {folder_path}")
     files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
         doc = Document(file_path)
         sections = _split_by_sections(doc)
         total_chunks = 0
         for s_idx, (section_title, paras) in enumerate(sections):
             chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
             total_chunks += len(chunks)
+            intent_tag = _infer_intent_tag(section_title)
             for c_idx, chunk in enumerate(chunks):
                 embedding = model.encode(chunk).tolist()
                 doc_id = f"{file}:{s_idx}:{c_idx}"
                     "chunk_index": c_idx,
                     "title": doc_title,
                     "collection": "SOP",
+                    "intent_tag": intent_tag,
                 }
                 try:
                     collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
                     except Exception as e2:
                         print(f"❌ Upsert failed for {doc_id}: {e2}")
                 tokens = _tokenize(chunk)
                 tf: Dict[str, int] = {}
                 for t in tokens:
                     tf[t] = tf.get(t, 0) + 1
                 idx = len(bm25_docs)
                 bm25_docs.append({"id": doc_id, "text": chunk, "tokens": tokens, "tf": tf, "length": len(tokens), "meta": meta})
                 seen = set()
                 for term in tf.keys():
                     bm25_inverted.setdefault(term, []).append(idx)
                     if term not in seen:
                         bm25_df[term] = bm25_df.get(term, 0) + 1
                         seen.add(term)
         print(f"📄 Ingested {file} → {total_chunks} chunks")
     N = len(bm25_docs)
     if N > 0:
         bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
         bm25_ready = True
     payload = {
         "bm25_docs": bm25_docs,
         "bm25_inverted": bm25_inverted,
     print(f"✅ BM25 index saved: {BM25_INDEX_FILE}")
     print(f"✅ Documents ingested. Total entries in Chroma: {collection.count()}")
 def _load_bm25_index() -> None:
     global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
     if not os.path.exists(BM25_INDEX_FILE):
     except Exception as e:
         print(f"⚠️ Could not load BM25 index: {e}")
 _load_bm25_index()
+# ---------------------- BM25 search ----------------------------------
 def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
     if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
         return 0.0
         score += idf * ((tf * (BM25_K1 + 1)) / (denom or 1.0))
     return score
 def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
     if not bm25_ready:
         return []
     scored.sort(key=lambda x: x[1], reverse=True)
     return scored[:top_k]
+# ---------------------- Semantic-only --------------------------------
 def search_knowledge_base(query: str, top_k: int = 10) -> dict:
+    """Query the Chroma collection using the embedding of *query*.
+    Args:
+        query: Text to embed with the module-level ``model``.
+        top_k: Number of results requested from Chroma.
+    Returns:
+        A dict of flat per-hit lists taken from the first (only) query in
+        the Chroma response.
+    """
     query_embedding = model.encode(query).tolist()
     res = collection.query(
         query_embeddings=[query_embedding],
         n_results=top_k,
+        # Chroma always returns "ids"; naming it in `include` fails include
+        # validation with a ValueError, so only optional fields are listed.
+        include=['documents', 'metadatas', 'distances']
     )
+    # Each field is a list-of-lists (one inner list per query embedding);
+    # fall back to an empty hit list when a field is missing or None.
+    documents = (res.get("documents", [[]]) or [[]])[0]
+    metadatas = (res.get("metadatas", [[]]) or [[]])[0]
+    distances = (res.get("distances", [[]]) or [[]])[0]
+    ids = (res.get("ids", [[]]) or [[]])[0]
     return {
         "documents": documents,
         "metadatas": metadatas,
         "ids": ids,
     }
+# ---------------------- Semantic intent + Hybrid ranking --------------
+# Semantic intent prototypes (generic, wording-agnostic).
+# Maps each intent label to a list of example phrases describing that intent.
+INTENT_PROTOTYPES = {
+    "steps": [
+        "how to perform", "procedure", "workflow", "instructions",
+        "steps to accomplish", "operate", "process to follow"
+    ],
+    "errors": [
+        "error condition", "issue troubleshooting", "resolution steps",
+        "fix failure", "diagnose problem"
+    ],
+    "prereqs": [
+        "pre-requisites", "requirements before starting", "setup needed"
+    ],
+    "purpose": [
+        "overview", "purpose", "introduction", "what is this about"
+    ],
+    "neutral": ["general information", "context", "details"],
+}
+# One prototype vector per intent label: the label's phrases are joined with
+# " ; " and embedded once, when this module-level statement runs.
+INTENT_PROTO_VECS = {name: model.encode(" ; ".join(phrases)).tolist() for name, phrases in INTENT_PROTOTYPES.items()}
+def _cosine(a: list, b: list) -> float:
+    """Return the cosine similarity of two equal-length numeric vectors.
+    Gives 0.0 when either vector is empty or the lengths differ. A zero
+    norm is swapped for 1.0, so an all-zero vector scores 0.0 rather than
+    raising ZeroDivisionError.
+    """
+    if not a or not b:
+        return 0.0
+    if len(a) != len(b):
+        return 0.0
+    inner = sum(u * v for u, v in zip(a, b))
+    norm_a = math.sqrt(sum(u * u for u in a))
+    norm_b = math.sqrt(sum(v * v for v in b))
+    if not norm_a:
+        norm_a = 1.0
+    if not norm_b:
+        norm_b = 1.0
+    return inner / (norm_a * norm_b)
+def classify_intent_semantic(query: str, min_margin: float = 0.08) -> str:
+    """Classify *query* into an intent label by embedding similarity.
+    The stripped query is embedded with the module-level ``model`` and scored
+    with ``_cosine`` against every vector in ``INTENT_PROTO_VECS``. The
+    top-scoring label is returned only when it leads the runner-up score by
+    at least ``min_margin``; otherwise the result is "neutral".
+    """
+    text = (query or "").strip()
+    query_vec = model.encode(text).tolist()
+    scores = {}
+    for name, vec in INTENT_PROTO_VECS.items():
+        scores[name] = _cosine(query_vec, vec)
+    top_label = max(scores, key=scores.get)
+    ordered = sorted(scores.values(), reverse=True)
+    runner_up = ordered[1] if len(ordered) > 1 else 0.0
+    if scores[top_label] - runner_up >= min_margin:
+        return top_label
+    return "neutral"
+# Canonical action -> phrases that signal it. `_extract_actions` and
+# `_action_weight` test these phrases as plain substrings of lowercased text.
+ACTION_SYNONYMS = {
+    "create": ["create", "creation", "add", "new", "generate"],
+    "update": ["update", "modify", "change", "edit"],
+    "delete": ["delete", "remove"],
+    "navigate": ["navigate", "go to", "open"],
+}
+def _extract_actions(query: str) -> List[str]:
+    """List the canonical actions whose synonyms occur in *query*.
+    The query is lowercased and each phrase in ``ACTION_SYNONYMS`` is tested
+    as a substring (so "add" also matches inside "address"). Actions are
+    returned in the dictionary's order; no match gives an empty list.
+    """
+    lowered = (query or "").lower()
+    return [
+        action
+        for action, synonyms in ACTION_SYNONYMS.items()
+        if any(phrase in lowered for phrase in synonyms)
+    ]
+def _intent_weight(meta: dict, user_intent: str) -> float:
+    """Score how well a chunk's ``intent_tag`` agrees with the user's intent.
+    Returns 0.0 for a "neutral" user intent, 1.0 on an exact tag match,
+    -0.6 when a "purpose"/"prereqs" chunk meets a "steps"/"errors" intent,
+    and -0.2 for any other mismatch. A missing tag counts as "neutral".
+    """
+    chunk_tag = (meta or {}).get("intent_tag", "neutral")
+    if user_intent == "neutral":
+        return 0.0
+    if chunk_tag == user_intent:
+        return 1.0
+    # Background sections are penalised harder when the user wants to act.
+    is_background = chunk_tag in ("purpose", "prereqs")
+    wants_task = user_intent in ("steps", "errors")
+    return -0.6 if is_background and wants_task else -0.2
 def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
     fn_tokens = _tokenize_meta_value(meta.get("filename"))
     title_tokens = _tokenize_meta_value(meta.get("title"))
     inter = len(meta_tokens & qset)
     return inter / max(1, len(qset))
+def _semantic_meta_overlap(meta: Dict[str, Any], query_vec: List[float]) -> float:
+    """Embedding similarity between the query vector and a chunk's metadata text.
+    The filename, title and section values are stringified, joined with
+    spaces and encoded with the module-level ``model``; the ``_cosine``
+    score against *query_vec* is clamped at 0.0. Empty metadata, or an
+    empty joined string, gives 0.0 without encoding anything.
+    """
+    if not meta:
+        return 0.0
+    fields = [str(meta.get(key, "")) for key in ("filename", "title", "section")]
+    label = " ".join(fields).strip()
+    if not label:
+        return 0.0
+    meta_vec = model.encode(label).tolist()
+    similarity = _cosine(query_vec, meta_vec)
+    return max(0.0, similarity)
+def _action_weight(text: str, actions: List[str]) -> float:
+    """Score chunk text for agreement with the actions found in the query.
+    Adds 1.0 for every synonym of a requested action that occurs as a
+    substring of the lowercased text, then subtracts 0.8 for every synonym
+    of a conflicting action (create<->delete, update->delete). Returns 0.0
+    when no actions were requested.
+    """
+    if not actions:
+        return 0.0
+    opposing = {"create": ["delete"], "delete": ["create"], "update": ["delete"], "navigate": []}
+    haystack = (text or "").lower()
+    hits = sum(
+        1
+        for action in actions
+        for phrase in ACTION_SYNONYMS.get(action, [action])
+        if phrase in haystack
+    )
+    score = float(hits)
+    # Penalties are applied one at a time, after all the rewards.
+    for action in actions:
+        for rival in opposing.get(action, []):
+            for phrase in ACTION_SYNONYMS.get(rival, [rival]):
+                if phrase in haystack:
+                    score -= 0.8
+    return score
 def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
     norm_query = _normalize_query(query)
     q_terms = _tokenize(norm_query)
+    # semantic intent
+    user_intent = classify_intent_semantic(query)
+    actions = _extract_actions(query)
+    # query vector
+    query_vec = model.encode(norm_query).tolist()
+    # semantic results
     sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
     sem_docs = sem_res.get("documents", [])
     sem_metas = sem_res.get("metadatas", [])
     sem_sims = [dist_to_sim(d) for d in sem_dists]
+    # bm25 results
     bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
     bm25_max = max([s for _, s in bm25_hits], default=1.0)
     bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
     bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
     for idx, nscore in bm25_norm_pairs:
         d = bm25_docs[idx]
     union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
+    # weights
+    gamma = 0.25  # lexical meta overlap
+    delta = 0.35  # intent boost
+    epsilon = 0.25  # action weight
+    zeta = 0.35    # semantic meta similarity
+    combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]] = []
     for cid in union_ids:
         if cid in sem_ids:
         text = sem_text if sem_text else bm25_text
         meta = sem_meta if sem_meta else bm25_meta
+        m_overlap = _meta_overlap(meta, q_terms)             # lexical overlap
+        m_sem = _semantic_meta_overlap(meta, query_vec)      # semantic overlap
+        intent_boost = _intent_weight(meta, user_intent)
+        act_wt = _action_weight(text, actions)
+        final_score = (
+            alpha * sem_sim +
+            beta * bm25_sim +
+            gamma * m_overlap +
+            zeta * m_sem +
+            delta * intent_boost +
+            epsilon * act_wt
+        )
         combined_records_ext.append(
+            (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, m_sem)
         )
     from collections import defaultdict
+    doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]]] = defaultdict(list)
     for rec in combined_records_ext:
         meta = rec[4] or {}
         fn = meta.get("filename", "unknown")
         doc_groups[fn].append(rec)
+    def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]]) -> float:
         total_score = sum(r[1] for r in recs)
         total_overlap = sum(r[5] for r in recs)
+        total_intent = sum(max(0.0, r[6]) for r in recs)
+        total_action = sum(max(0.0, r[7]) for r in recs)
+        total_sem_meta = sum(r[8] for r in recs)
+        total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
+        return total_score + 0.4 * total_overlap + 0.6 * total_intent + 0.5 * total_action + 0.6 * total_sem_meta + 0.3 * total_penalty
     best_doc, best_doc_prior = None, -1.0
     for fn, recs in doc_groups.items():
             continue
         other_recs.extend(recs)
     other_recs.sort(key=lambda x: x[1], reverse=True)
     reordered = best_recs + other_recs
     top = reordered[:top_k]
     documents = [t[3] for t in top]
     metadatas = [t[4] for t in top]
     distances = [t[2] for t in top]
         "best_doc": best_doc,
         "best_doc_prior": best_doc_prior,
         "user_intent": user_intent,
+        "actions": actions,
     }
+# ---------------------- Section fetch helpers -------------------------
 def get_section_text(filename: str, section: str) -> str:
     """Concatenate all chunk texts for a given filename+section."""
     texts: List[str] = []
                 texts.append(t)
     return "\n\n".join(texts).strip()
 def get_best_steps_section_text(filename: str) -> str:
     """Return combined text of all 'steps' sections in the given SOP (filename)."""
     texts: List[str] = []
                 texts.append(t)
     return "\n\n".join(texts).strip()
+# ---------------------- Admin helpers --------------------------------
 def get_kb_runtime_info() -> Dict[str, Any]:
     return {
         "chroma_path": CHROMA_PATH,
         "bm25_ready": bm25_ready,
     }
 def reset_kb(folder_path: str) -> Dict[str, Any]:
     result = {"status": "OK", "message": "KB reset and re-ingested"}
     try: