srilakshu012456 committed on
Commit
c6f40fa
·
verified ·
1 Parent(s): 5e31c75

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +189 -141
services/kb_creation.py CHANGED
@@ -1,33 +1,20 @@
1
-
2
- #!/usr/bin/env python3
3
- # -*- coding: utf-8 -*-
4
- """
5
- services/kb_creation.py
6
-
7
- Generic, meaning-aware intent & ranking:
8
- - Semantic intent classification (no keyword rules).
9
- - Hybrid score = semantic similarity + BM25 + lexical meta overlap + semantic meta overlap.
10
- - Chroma 'include' excludes 'ids'; IDs synthesized from metadata.
11
- """
12
-
13
  import os
14
  import re
15
  import pickle
16
- import math
17
  from typing import List, Dict, Any, Tuple, Optional
18
  from docx import Document
19
  from sentence_transformers import SentenceTransformer
20
  import chromadb
21
 
22
- # -------- ChromaDB --------
23
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
24
  client = chromadb.PersistentClient(path=CHROMA_PATH)
25
  collection = client.get_or_create_collection(name="knowledge_base")
26
 
27
- # -------- Embeddings --------
28
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
29
 
30
- # -------- BM25 (lightweight) --------
31
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
32
  bm25_docs: List[Dict[str, Any]] = []
33
  bm25_inverted: Dict[str, List[int]] = {}
@@ -37,26 +24,32 @@ bm25_ready: bool = False
37
  BM25_K1 = 1.5
38
  BM25_B = 0.75
39
 
40
- # -------- Utilities --------
41
  def _tokenize(text: str) -> List[str]:
42
- if not text: return []
 
43
  text = text.lower()
44
  return re.findall(r"[a-z0-9]+", text)
45
 
46
  def _normalize_query(q: str) -> str:
47
  q = (q or "").strip().lower()
48
  q = re.sub(r"[^\w\s]", " ", q)
49
- q = re.sub(r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b", " ", q)
 
 
 
 
50
  q = re.sub(r"\s+", " ", q).strip()
51
  return q
52
 
53
  def _tokenize_meta_value(val: Optional[str]) -> List[str]:
54
  return _tokenize(val or "")
55
 
56
- # -------- DOCX parsing & chunking --------
57
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
58
  sections: List[Tuple[str, List[str]]] = []
59
- current_title, current_paras = None, []
 
60
  for para in doc.paragraphs:
61
  text = (para.text or "").strip()
62
  style_name = (para.style.name if para.style else "") or ""
@@ -64,7 +57,8 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
64
  if is_heading and text:
65
  if current_title or current_paras:
66
  sections.append((current_title or "Untitled Section", current_paras))
67
- current_title, current_paras = text, []
 
68
  else:
69
  if text:
70
  current_paras.append(text)
@@ -77,29 +71,39 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
77
 
78
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
79
  body = "\n".join(paragraphs).strip()
80
- if not body: return []
 
81
  words = body.split()
82
  chunks: List[str] = []
83
  for i in range(0, len(words), max_words):
84
  chunk_body = ' '.join(words[i:i + max_words]).strip()
85
  if chunk_body:
86
- chunks.append(chunk_body)
87
- if not chunks: chunks = [body]
 
88
  return chunks
89
 
90
- # -------- Intent tag from section (for metadata only) --------
91
  def _infer_intent_tag(section_title: str) -> str:
92
  st = (section_title or "").lower()
93
- if any(k in st for k in ["process steps", "procedure", "how to", "workflow", "instructions"]): return "steps"
94
- if any(k in st for k in ["common errors", "resolution", "troubleshooting"]): return "errors"
95
- if any(k in st for k in ["pre-requisites", "prerequisites"]): return "prereqs"
96
- if any(k in st for k in ["purpose", "overview", "introduction"]): return "purpose"
 
 
 
 
97
  return "neutral"
98
 
99
- # -------- Ingestion --------
100
  def ingest_documents(folder_path: str) -> None:
 
101
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
102
- if not files: return
 
 
 
103
 
104
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
105
  bm25_docs, bm25_inverted, bm25_df = [], {}, {}
@@ -110,8 +114,11 @@ def ingest_documents(folder_path: str) -> None:
110
  doc_title = os.path.splitext(file)[0]
111
  doc = Document(file_path)
112
  sections = _split_by_sections(doc)
 
 
113
  for s_idx, (section_title, paras) in enumerate(sections):
114
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
 
115
  intent_tag = _infer_intent_tag(section_title)
116
  for c_idx, chunk in enumerate(chunks):
117
  embedding = model.encode(chunk).tolist()
@@ -122,7 +129,7 @@ def ingest_documents(folder_path: str) -> None:
122
  "chunk_index": c_idx,
123
  "title": doc_title,
124
  "collection": "SOP",
125
- "intent_tag": intent_tag,
126
  }
127
  try:
128
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
@@ -130,14 +137,16 @@ def ingest_documents(folder_path: str) -> None:
130
  try:
131
  collection.delete(ids=[doc_id])
132
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
133
- except Exception:
134
- pass
135
 
136
  tokens = _tokenize(chunk)
137
  tf: Dict[str, int] = {}
138
- for t in tokens: tf[t] = tf.get(t, 0) + 1
 
139
  idx = len(bm25_docs)
140
  bm25_docs.append({"id": doc_id, "text": chunk, "tokens": tokens, "tf": tf, "length": len(tokens), "meta": meta})
 
141
  seen = set()
142
  for term in tf.keys():
143
  bm25_inverted.setdefault(term, []).append(idx)
@@ -145,20 +154,31 @@ def ingest_documents(folder_path: str) -> None:
145
  bm25_df[term] = bm25_df.get(term, 0) + 1
146
  seen.add(term)
147
 
 
 
148
  N = len(bm25_docs)
149
  if N > 0:
150
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
151
  bm25_ready = True
152
 
153
- payload = {"bm25_docs": bm25_docs, "bm25_inverted": bm25_inverted, "bm25_df": bm25_df,
154
- "bm25_avgdl": bm25_avgdl, "BM25_K1": BM25_K1, "BM25_B": BM25_B}
 
 
 
 
 
 
155
  os.makedirs(CHROMA_PATH, exist_ok=True)
156
  with open(BM25_INDEX_FILE, "wb") as f:
157
  pickle.dump(payload, f)
 
 
158
 
159
  def _load_bm25_index() -> None:
160
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
161
- if not os.path.exists(BM25_INDEX_FILE): return
 
162
  try:
163
  with open(BM25_INDEX_FILE, "rb") as f:
164
  payload = pickle.load(f)
@@ -167,114 +187,132 @@ def _load_bm25_index() -> None:
167
  bm25_df = payload.get("bm25_df", {})
168
  bm25_avgdl = payload.get("bm25_avgdl", 0.0)
169
  bm25_ready = len(bm25_docs) > 0
170
- except Exception:
171
- pass
 
 
172
 
173
  _load_bm25_index()
174
 
175
- # -------- BM25 search --------
176
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
177
- if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs): return 0.0
 
178
  doc = bm25_docs[doc_idx]
179
- score, dl = 0.0, (doc["length"] or 1)
 
180
  for term in query_terms:
181
  df = bm25_df.get(term, 0)
182
- if df == 0: continue
 
183
  tf = doc["tf"].get(term, 0)
184
- if tf == 0: continue
 
185
  N = len(bm25_docs)
186
  idf_ratio = ((N - df + 0.5) / (df + 0.5))
187
- try: idf = math.log(idf_ratio + 1.0)
188
- except Exception: idf = 1.0
 
 
 
189
  denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
190
  score += idf * ((tf * (BM25_K1 + 1)) / (denom or 1.0))
191
  return score
192
 
193
  def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
194
- if not bm25_ready: return []
 
195
  norm = _normalize_query(query)
196
  q_terms = _tokenize(norm)
197
- if not q_terms: return []
 
198
  candidates = set()
199
  for t in q_terms:
200
- for idx in bm25_inverted.get(t, []): candidates.add(idx)
201
- if not candidates: candidates = set(range(len(bm25_docs)))
 
 
202
  scored = []
203
  for idx in candidates:
204
  s = _bm25_score_for_doc(q_terms, idx)
205
- if s > 0: scored.append((idx, s))
 
206
  scored.sort(key=lambda x: x[1], reverse=True)
207
  return scored[:top_k]
208
 
209
- # -------- SAFE Chroma query and semantic-only --------
210
- def _safe_collection_query(query_embedding, top_k: int):
211
- base_include = ['documents', 'metadatas', 'distances'] # supported
212
- return collection.query(query_embeddings=[query_embedding], n_results=top_k, include=base_include)
213
-
214
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
215
  query_embedding = model.encode(query).tolist()
216
- res = _safe_collection_query(query_embedding, top_k)
217
- documents = (res.get("documents", [[]]) or [[]])[0]
218
- metadatas = (res.get("metadatas", [[]]) or [[]])[0]
219
- distances = (res.get("distances", [[]]) or [[]])[0]
220
- # synthesize ids
221
- ids = []
222
- for i, m in enumerate(metadatas):
223
- fn = (m or {}).get("filename", "unknown")
224
- sec = (m or {}).get("section", "section")
225
- idx = (m or {}).get("chunk_index", i)
226
- ids.append(f"{fn}:{sec}:{idx}")
227
- return {"documents": documents, "metadatas": metadatas, "distances": distances, "ids": ids}
228
-
229
- # -------- Semantic intent + Hybrid ranking --------
230
- INTENT_PROTOTYPES = {
231
- "steps": ["how to perform", "procedure", "workflow", "instructions",
232
- "steps to accomplish", "operate", "process to follow"],
233
- "errors": ["error condition", "issue troubleshooting", "resolution steps",
234
- "fix failure", "diagnose problem",
235
- # NEW: permission/authorization/role access signals
236
- "permission denied", "not authorized", "authorization required",
237
- "role access missing", "access not allowed", "insufficient privileges"],
238
- "prereqs": ["pre-requisites", "requirements before starting", "setup needed"],
239
- "purpose": ["overview", "purpose", "introduction", "what is this about"],
240
- "neutral": ["general information", "context", "details"],
241
- }
242
- INTENT_PROTO_VECS = {name: model.encode(" ; ".join(phrases)).tolist() for name, phrases in INTENT_PROTOTYPES.items()}
243
-
244
- def _cosine(a: list, b: list) -> float:
245
- if not a or not b or len(a) != len(b): return 0.0
246
- dot = sum(x*y for x, y in zip(a, b))
247
- na = math.sqrt(sum(x*x for x in a)) or 1.0
248
- nb = math.sqrt(sum(y*y for y in b)) or 1.0
249
- return dot / (na * nb)
250
-
251
- def classify_intent_semantic(query: str, min_margin: float = 0.08) -> str:
252
- qv = model.encode((query or "").strip()).tolist()
253
- scores = {name: _cosine(qv, vec) for name, vec in INTENT_PROTO_VECS.items()}
254
- best = max(scores.items(), key=lambda kv: kv[1])
255
- second = sorted(scores.values(), reverse=True)[1] if len(scores) > 1 else 0.0
256
- if best[1] - second >= min_margin: return best[0] if best[0] != "neutral" else "neutral"
257
- return "neutral"
258
 
 
259
  ACTION_SYNONYMS = {
260
  "create": ["create", "creation", "add", "new", "generate"],
261
  "update": ["update", "modify", "change", "edit"],
262
  "delete": ["delete", "remove"],
263
  "navigate": ["navigate", "go to", "open"],
 
264
  }
265
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  def _extract_actions(query: str) -> List[str]:
267
  q = (query or "").lower()
268
  found = []
269
  for act, syns in ACTION_SYNONYMS.items():
270
- if any(s in q for s in syns): found.append(act)
 
271
  return found or []
272
 
273
  def _intent_weight(meta: dict, user_intent: str) -> float:
274
  tag = (meta or {}).get("intent_tag", "neutral")
275
- if user_intent == "neutral": return 0.0
276
- if tag == user_intent: return 1.0
277
- if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]: return -0.6
 
 
 
278
  return -0.2
279
 
280
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
@@ -282,38 +320,34 @@ def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
282
  title_tokens = _tokenize_meta_value(meta.get("title"))
283
  section_tokens = _tokenize_meta_value(meta.get("section"))
284
  meta_tokens = set(fn_tokens + title_tokens + section_tokens)
285
- if not meta_tokens or not q_terms: return 0.0
 
286
  qset = set(q_terms)
287
  inter = len(meta_tokens & qset)
288
  return inter / max(1, len(qset))
289
 
290
- def _semantic_meta_overlap(meta: Dict[str, Any], query_vec: List[float]) -> float:
291
- s = " ".join([str(meta.get("filename", "")), str(meta.get("title", "")), str(meta.get("section", ""))]).strip()
292
- if not s: return 0.0
293
- mv = model.encode(s).tolist()
294
- return max(0.0, _cosine(query_vec, mv))
295
-
296
  def _action_weight(text: str, actions: List[str]) -> float:
297
- if not actions: return 0.0
 
298
  t = (text or "").lower()
299
  score = 0.0
300
  for act in actions:
301
  for syn in ACTION_SYNONYMS.get(act, [act]):
302
- if syn in t: score += 1.0
 
303
  conflicts = {"create": ["delete"], "delete": ["create"], "update": ["delete"], "navigate": []}
304
  for act in actions:
305
  for bad in conflicts.get(act, []):
306
  for syn in ACTION_SYNONYMS.get(bad, [bad]):
307
- if syn in t: score -= 0.8
 
308
  return score
309
 
310
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
311
  norm_query = _normalize_query(query)
312
  q_terms = _tokenize(norm_query)
313
-
314
- user_intent = classify_intent_semantic(query) # semantic intent
315
  actions = _extract_actions(query)
316
- query_vec = model.encode(norm_query).tolist()
317
 
318
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
319
  sem_docs = sem_res.get("documents", [])
@@ -322,15 +356,19 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
322
  sem_ids = sem_res.get("ids", [])
323
 
324
  def dist_to_sim(d: Optional[float]) -> float:
325
- if d is None: return 0.0
326
- try: return 1.0 / (1.0 + float(d))
327
- except Exception: return 0.0
 
 
 
328
 
329
  sem_sims = [dist_to_sim(d) for d in sem_dists]
330
 
331
  bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
332
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
333
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
 
334
  bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
335
  for idx, nscore in bm25_norm_pairs:
336
  d = bm25_docs[idx]
@@ -340,13 +378,11 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
340
 
341
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
342
 
343
- # weights
344
- gamma = 0.25 # lexical meta overlap
345
- delta = 0.35 # intent tag vs user intent
346
- epsilon = 0.25 # action weight
347
- zeta = 0.35 # semantic meta similarity (NEW)
348
 
349
- combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]] = []
350
  for cid in union_ids:
351
  if cid in sem_ids:
352
  pos = sem_ids.index(cid)
@@ -365,28 +401,29 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
365
  meta = sem_meta if sem_meta else bm25_meta
366
 
367
  m_overlap = _meta_overlap(meta, q_terms)
368
- m_sem = _semantic_meta_overlap(meta, query_vec) # NEW semantic meta
369
  intent_boost = _intent_weight(meta, user_intent)
370
  act_wt = _action_weight(text, actions)
371
 
372
- final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + zeta * m_sem + delta * intent_boost + epsilon * act_wt
373
- combined_records_ext.append((cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, m_sem))
 
 
 
374
 
375
  from collections import defaultdict
376
- doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]]] = defaultdict(list)
377
  for rec in combined_records_ext:
378
  meta = rec[4] or {}
379
  fn = meta.get("filename", "unknown")
380
  doc_groups[fn].append(rec)
381
 
382
- def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]]) -> float:
383
  total_score = sum(r[1] for r in recs)
384
  total_overlap = sum(r[5] for r in recs)
385
  total_intent = sum(max(0.0, r[6]) for r in recs)
386
  total_action = sum(max(0.0, r[7]) for r in recs)
387
- total_sem_meta = sum(r[8] for r in recs)
388
  total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
389
- return total_score + 0.4 * total_overlap + 0.6 * total_intent + 0.5 * total_action + 0.6 * total_sem_meta + 0.3 * total_penalty
390
 
391
  best_doc, best_doc_prior = None, -1.0
392
  for fn, recs in doc_groups.items():
@@ -397,11 +434,14 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
397
  best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
398
  other_recs = []
399
  for fn, recs in doc_groups.items():
400
- if fn == best_doc: continue
 
401
  other_recs.extend(recs)
402
  other_recs.sort(key=lambda x: x[1], reverse=True)
 
403
  reordered = best_recs + other_recs
404
  top = reordered[:top_k]
 
405
  documents = [t[3] for t in top]
406
  metadatas = [t[4] for t in top]
407
  distances = [t[2] for t in top]
@@ -420,26 +460,30 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
420
  "actions": actions,
421
  }
422
 
423
- # -------- Section helpers --------
424
  def get_section_text(filename: str, section: str) -> str:
 
425
  texts: List[str] = []
426
  for d in bm25_docs:
427
  m = d.get("meta", {})
428
  if m.get("filename") == filename and m.get("section") == section:
429
  t = (d.get("text") or "").strip()
430
- if t: texts.append(t)
 
431
  return "\n\n".join(texts).strip()
432
 
433
  def get_best_steps_section_text(filename: str) -> str:
 
434
  texts: List[str] = []
435
  for d in bm25_docs:
436
  m = d.get("meta", {})
437
  if m.get("filename") == filename and (m.get("intent_tag") == "steps"):
438
  t = (d.get("text") or "").strip()
439
- if t: texts.append(t)
 
440
  return "\n\n".join(texts).strip()
441
 
442
- # -------- Admin --------
443
  def get_kb_runtime_info() -> Dict[str, Any]:
444
  return {
445
  "chroma_path": CHROMA_PATH,
@@ -453,12 +497,15 @@ def get_kb_runtime_info() -> Dict[str, Any]:
453
  def reset_kb(folder_path: str) -> Dict[str, Any]:
454
  result = {"status": "OK", "message": "KB reset and re-ingested"}
455
  try:
456
- try: client.delete_collection(name="knowledge_base")
457
- except Exception: pass
 
 
458
  global collection
459
  collection = client.get_or_create_collection(name="knowledge_base")
460
  try:
461
- if os.path.isfile(BM25_INDEX_FILE): os.remove(BM25_INDEX_FILE)
 
462
  except Exception as e:
463
  result.setdefault("warnings", []).append(f"bm25 index delete: {e}")
464
  os.makedirs(CHROMA_PATH, exist_ok=True)
@@ -467,3 +514,4 @@ def reset_kb(folder_path: str) -> Dict[str, Any]:
467
  return result
468
  except Exception as e:
469
  return {"status": "ERROR", "error": f"{e}", "info": get_kb_runtime_info()}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import re
3
  import pickle
 
4
  from typing import List, Dict, Any, Tuple, Optional
5
  from docx import Document
6
  from sentence_transformers import SentenceTransformer
7
  import chromadb
8
 
9
+ # --------------------------- ChromaDB setup ---------------------------
10
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
11
  client = chromadb.PersistentClient(path=CHROMA_PATH)
12
  collection = client.get_or_create_collection(name="knowledge_base")
13
 
14
+ # --------------------------- Embedding model ---------------------------
15
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
16
 
17
+ # --------------------------- BM25 (lightweight) ---------------------------
18
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
19
  bm25_docs: List[Dict[str, Any]] = []
20
  bm25_inverted: Dict[str, List[int]] = {}
 
24
  BM25_K1 = 1.5
25
  BM25_B = 0.75
26
 
27
+ # --------------------------- Utilities ---------------------------
28
  def _tokenize(text: str) -> List[str]:
29
+ if not text:
30
+ return []
31
  text = text.lower()
32
  return re.findall(r"[a-z0-9]+", text)
33
 
34
  def _normalize_query(q: str) -> str:
35
  q = (q or "").strip().lower()
36
  q = re.sub(r"[^\w\s]", " ", q)
37
+ q = re.sub(
38
+ r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b",
39
+ " ",
40
+ q,
41
+ )
42
  q = re.sub(r"\s+", " ", q).strip()
43
  return q
44
 
45
def _tokenize_meta_value(val: Optional[str]) -> List[str]:
    """Tokenize a single metadata field value; None behaves as empty string."""
    return _tokenize("" if val is None else val)
47
 
48
+ # --------------------------- DOCX parsing & chunking ---------------------------
49
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
50
  sections: List[Tuple[str, List[str]]] = []
51
+ current_title = None
52
+ current_paras: List[str] = []
53
  for para in doc.paragraphs:
54
  text = (para.text or "").strip()
55
  style_name = (para.style.name if para.style else "") or ""
 
57
  if is_heading and text:
58
  if current_title or current_paras:
59
  sections.append((current_title or "Untitled Section", current_paras))
60
+ current_title = text
61
+ current_paras = []
62
  else:
63
  if text:
64
  current_paras.append(text)
 
71
 
72
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
73
  body = "\n".join(paragraphs).strip()
74
+ if not body:
75
+ return []
76
  words = body.split()
77
  chunks: List[str] = []
78
  for i in range(0, len(words), max_words):
79
  chunk_body = ' '.join(words[i:i + max_words]).strip()
80
  if chunk_body:
81
+ chunks.append(chunk_body) # no doc/section headers inside text
82
+ if not chunks:
83
+ chunks = [body]
84
  return chunks
85
 
86
+ # --------------------------- Intent tagging (auto) ---------------------------
87
  def _infer_intent_tag(section_title: str) -> str:
88
  st = (section_title or "").lower()
89
+ if any(k in st for k in ["process steps", "procedure", "how to", "workflow", "instructions"]):
90
+ return "steps"
91
+ if any(k in st for k in ["common errors", "resolution", "troubleshooting"]):
92
+ return "errors"
93
+ if any(k in st for k in ["pre-requisites", "prerequisites"]):
94
+ return "prereqs"
95
+ if any(k in st for k in ["purpose", "overview", "introduction"]):
96
+ return "purpose"
97
  return "neutral"
98
 
99
+ # --------------------------- Ingestion ---------------------------
100
  def ingest_documents(folder_path: str) -> None:
101
+ print(f"📂 Checking folder: {folder_path}")
102
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
103
+ print(f"Found {len(files)} Word files: {files}")
104
+ if not files:
105
+ print("⚠️ No .docx files found. Please check the folder path.")
106
+ return
107
 
108
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
109
  bm25_docs, bm25_inverted, bm25_df = [], {}, {}
 
114
  doc_title = os.path.splitext(file)[0]
115
  doc = Document(file_path)
116
  sections = _split_by_sections(doc)
117
+ total_chunks = 0
118
+
119
  for s_idx, (section_title, paras) in enumerate(sections):
120
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
121
+ total_chunks += len(chunks)
122
  intent_tag = _infer_intent_tag(section_title)
123
  for c_idx, chunk in enumerate(chunks):
124
  embedding = model.encode(chunk).tolist()
 
129
  "chunk_index": c_idx,
130
  "title": doc_title,
131
  "collection": "SOP",
132
+ "intent_tag": intent_tag, # NEW
133
  }
134
  try:
135
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
 
137
  try:
138
  collection.delete(ids=[doc_id])
139
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
140
+ except Exception as e2:
141
+ print(f"❌ Upsert failed for {doc_id}: {e2}")
142
 
143
  tokens = _tokenize(chunk)
144
  tf: Dict[str, int] = {}
145
+ for t in tokens:
146
+ tf[t] = tf.get(t, 0) + 1
147
  idx = len(bm25_docs)
148
  bm25_docs.append({"id": doc_id, "text": chunk, "tokens": tokens, "tf": tf, "length": len(tokens), "meta": meta})
149
+
150
  seen = set()
151
  for term in tf.keys():
152
  bm25_inverted.setdefault(term, []).append(idx)
 
154
  bm25_df[term] = bm25_df.get(term, 0) + 1
155
  seen.add(term)
156
 
157
+ print(f"📄 Ingested {file} → {total_chunks} chunks")
158
+
159
  N = len(bm25_docs)
160
  if N > 0:
161
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
162
  bm25_ready = True
163
 
164
+ payload = {
165
+ "bm25_docs": bm25_docs,
166
+ "bm25_inverted": bm25_inverted,
167
+ "bm25_df": bm25_df,
168
+ "bm25_avgdl": bm25_avgdl,
169
+ "BM25_K1": BM25_K1,
170
+ "BM25_B": BM25_B,
171
+ }
172
  os.makedirs(CHROMA_PATH, exist_ok=True)
173
  with open(BM25_INDEX_FILE, "wb") as f:
174
  pickle.dump(payload, f)
175
+ print(f"✅ BM25 index saved: {BM25_INDEX_FILE}")
176
+ print(f"✅ Documents ingested. Total entries in Chroma: {collection.count()}")
177
 
178
  def _load_bm25_index() -> None:
179
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
180
+ if not os.path.exists(BM25_INDEX_FILE):
181
+ return
182
  try:
183
  with open(BM25_INDEX_FILE, "rb") as f:
184
  payload = pickle.load(f)
 
187
  bm25_df = payload.get("bm25_df", {})
188
  bm25_avgdl = payload.get("bm25_avgdl", 0.0)
189
  bm25_ready = len(bm25_docs) > 0
190
+ if bm25_ready:
191
+ print(f"✅ BM25 index loaded: {BM25_INDEX_FILE} (docs={len(bm25_docs)})")
192
+ except Exception as e:
193
+ print(f"⚠️ Could not load BM25 index: {e}")
194
 
195
  _load_bm25_index()
196
 
197
+ # --------------------------- BM25 search ---------------------------
198
def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
    """Okapi BM25 score of indexed document *doc_idx* for *query_terms*.

    Returns 0.0 when the BM25 index is not ready or *doc_idx* is out of
    range.  Uses idf = log(1 + (N - df + 0.5) / (df + 0.5)), which is
    non-negative for every df in [1, N].
    """
    # Hoisted out of the per-term loop: the original re-ran `import math`
    # (and a try/except around it) on every query term.
    import math

    if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
        return 0.0
    doc = bm25_docs[doc_idx]
    dl = doc["length"] or 1  # guard against zero-length docs
    N = len(bm25_docs)       # loop-invariant corpus size
    score = 0.0
    for term in query_terms:
        df = bm25_df.get(term, 0)
        if df == 0:
            continue  # term not in corpus
        tf = doc["tf"].get(term, 0)
        if tf == 0:
            continue  # term not in this document
        # df <= N, so the log argument is strictly > 1 and cannot raise;
        # the previous per-term try/except fallback was dead code.
        idf = math.log(((N - df + 0.5) / (df + 0.5)) + 1.0)
        denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
        score += idf * ((tf * (BM25_K1 + 1)) / (denom or 1.0))
    return score
221
 
222
def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
    """Return up to *top_k* ``(doc_index, bm25_score)`` pairs, best first.

    Empty when the index is not loaded or the normalized query has no
    tokens; only positive-scoring documents are returned.
    """
    if not bm25_ready:
        return []
    q_terms = _tokenize(_normalize_query(query))
    if not q_terms:
        return []
    # Candidate pool: every document containing at least one query term.
    candidates: set = set()
    for term in q_terms:
        candidates.update(bm25_inverted.get(term, []))
    if not candidates:
        # No posting list matched — fall back to scoring the whole corpus.
        candidates = set(range(len(bm25_docs)))
    scored = []
    for doc_idx in candidates:
        s = _bm25_score_for_doc(q_terms, doc_idx)
        if s > 0:
            scored.append((doc_idx, s))
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]
242
 
243
+ # --------------------------- Semantic-only ---------------------------
 
 
 
 
244
def search_knowledge_base(query: str, top_k: int = 10) -> dict:
    """Semantic-only search against the Chroma collection.

    Returns a dict of parallel lists: documents, metadatas, distances, ids.
    When Chroma returns no ids, they are synthesized from metadata as
    ``filename:section:chunk_index``.
    """
    query_vec = model.encode(query).tolist()
    res = collection.query(
        query_embeddings=[query_vec],
        n_results=top_k,
        include=['documents', 'metadatas', 'distances']
    )

    def _first_row(key: str) -> list:
        # Chroma returns list-of-lists (one row per query embedding).
        rows = res.get(key, [[]]) or [[]]
        return rows[0] if rows else []

    documents = _first_row("documents")
    metadatas = _first_row("metadatas")
    distances = _first_row("distances")
    ids = _first_row("ids")

    if not ids and documents:
        # Synthesize stable ids from metadata when Chroma omits them.
        ids = [
            f"{(m or {}).get('filename', 'unknown')}:"
            f"{(m or {}).get('section', 'section')}:"
            f"{(m or {}).get('chunk_index', i)}"
            for i, m in enumerate(metadatas)
        ]

    print(f"🔎 KB search {len(documents)} docs (top_k={top_k}); "
          f"first distance: {distances[0] if distances else 'n/a'}; ids={len(ids)}")
    return {
        "documents": documents,
        "metadatas": metadatas,
        "distances": distances,
        "ids": ids,
    }
 
 
 
 
 
 
 
 
 
 
278
 
279
+ # --------------------------- Hybrid (BM25 + Embeddings + Intent + Action) ---------------------------
280
# Action verbs and their surface synonyms; used both to detect the action a
# query asks about and to weight chunks that mention it.
ACTION_SYNONYMS = {
    "create": ["create", "creation", "add", "new", "generate"],
    "update": ["update", "modify", "change", "edit"],
    "delete": ["delete", "remove"],
    "navigate": ["navigate", "go to", "open"],
    # NOTE: 'perform' deliberately excluded to avoid wrong boosts like
    # Appointment "performed...".
}
287
 
288
+ def _detect_user_intent(query: str) -> str:
289
+ q = (query or "").lower()
290
+ if any(k in q for k in ["steps", "procedure", "how to", "navigate", "perform", "do", "process"]):
291
+ return "steps"
292
+ if any(k in q for k in ["error", "issue", "fail", "not working", "resolution", "fix"]):
293
+ return "errors"
294
+ if any(k in q for k in ["pre-requisite", "prerequisites", "requirement", "requirements"]):
295
+ return "prereqs"
296
+ if any(k in q for k in ["purpose", "overview", "introduction"]):
297
+ return "purpose"
298
+ return "neutral"
299
+
300
def _extract_actions(query: str) -> List[str]:
    """Return every action whose synonym appears as a substring of the query.

    Preserves ACTION_SYNONYMS insertion order; returns an empty list when
    nothing matches.
    """
    lowered = (query or "").lower()
    return [
        action
        for action, synonyms in ACTION_SYNONYMS.items()
        if any(syn in lowered for syn in synonyms)
    ]
307
 
308
  def _intent_weight(meta: dict, user_intent: str) -> float:
309
  tag = (meta or {}).get("intent_tag", "neutral")
310
+ if user_intent == "neutral":
311
+ return 0.0
312
+ if tag == user_intent:
313
+ return 1.0
314
+ if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]:
315
+ return -0.6
316
  return -0.2
317
 
318
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
 
320
  title_tokens = _tokenize_meta_value(meta.get("title"))
321
  section_tokens = _tokenize_meta_value(meta.get("section"))
322
  meta_tokens = set(fn_tokens + title_tokens + section_tokens)
323
+ if not meta_tokens or not q_terms:
324
+ return 0.0
325
  qset = set(q_terms)
326
  inter = len(meta_tokens & qset)
327
  return inter / max(1, len(qset))
328
 
 
 
 
 
 
 
329
def _action_weight(text: str, actions: List[str]) -> float:
    """Reward chunks mentioning the query's action synonyms; penalize conflicts.

    +1.0 per matching synonym of a requested action; -0.8 per synonym of a
    conflicting action (e.g. the query asks "create" but the chunk talks
    about "delete").  Returns 0.0 when no actions were detected.
    """
    if not actions:
        return 0.0
    lowered = (text or "").lower()
    score = 0.0
    for action in actions:
        for syn in ACTION_SYNONYMS.get(action, [action]):
            if syn in lowered:
                score += 1.0
    # Opposing actions appearing in the same chunk count against it.
    conflicts = {"create": ["delete"], "delete": ["create"], "update": ["delete"], "navigate": []}
    for action in actions:
        for opposing in conflicts.get(action, []):
            for syn in ACTION_SYNONYMS.get(opposing, [opposing]):
                if syn in lowered:
                    score -= 0.8
    return score
345
 
def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
    """Hybrid retrieval: blend semantic, lexical and metadata signals.

    Per-chunk score:
        final = alpha*semantic_sim + beta*bm25_norm + gamma*meta_overlap
                + delta*intent_boost + epsilon*action_weight

    Chunks are then grouped by source document; every chunk of the document
    with the highest aggregate prior is ranked ahead of all other chunks.

    NOTE(review): several runs of this function's lines were lost in a garbled
    diff (semantic metadata/distance extraction, BM25 id derivation, the
    per-candidate scoring inputs, the best-document selection loop, and the
    tail of the return dict). Those parts are reconstructed from the
    surrounding code — confirm exact output keys against repository history.

    Args:
        query: Raw user query.
        top_k: Number of chunks to return.
        alpha/beta: Weights for the semantic and BM25 legs.

    Returns:
        dict with documents, metadatas, distances, ids, scores,
        best_document, intent and actions.
    """
    norm_query = _normalize_query(query)
    q_terms = _tokenize(norm_query)
    user_intent = _detect_user_intent(query)
    actions = _extract_actions(query)

    # --- semantic leg (Chroma) ---
    sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
    sem_docs = sem_res.get("documents", [])
    sem_metas = sem_res.get("metadatas", [])   # reconstructed line
    sem_dists = sem_res.get("distances", [])   # reconstructed line
    sem_ids = sem_res.get("ids", [])

    def dist_to_sim(d: Optional[float]) -> float:
        # Map a non-negative distance to a similarity in (0, 1].
        if d is None:
            return 0.0
        try:
            return 1.0 / (1.0 + float(d))
        except Exception:
            return 0.0

    sem_sims = [dist_to_sim(d) for d in sem_dists]
    # Position map: O(1) lookups instead of repeated list.index() scans.
    sem_pos = {cid: i for i, cid in enumerate(sem_ids)}

    # --- lexical leg (BM25), scores normalized to [0, 1] ---
    bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
    bm25_max = max([s for _, s in bm25_hits], default=1.0)
    bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]

    bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
    for idx, nscore in bm25_norm_pairs:
        d = bm25_docs[idx]
        # NOTE(review): id derivation reconstructed — assumes each BM25 doc
        # record carries the same synthesized id used on the Chroma side;
        # falls back to a metadata-based id. Confirm against ingestion code.
        m = d.get("meta", {}) or {}
        cid = d.get("id") or f"{m.get('filename', 'unknown')}::{m.get('section', '')}::{idx}"
        bm25_id_to_norm[cid] = max(nscore, bm25_id_to_norm.get(cid, 0.0))
        bm25_id_to_text[cid] = d.get("text", "")
        bm25_id_to_meta[cid] = m

    union_ids = set(sem_pos) | set(bm25_id_to_norm)

    gamma = 0.25    # meta overlap weight
    delta = 0.35    # intent boost weight
    epsilon = 0.30  # action weight

    combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]] = []
    for cid in union_ids:
        # NOTE(review): per-candidate extraction reconstructed from the
        # variables consumed below (sem_sim, sem_dist, text, meta, bm25_sim).
        if cid in sem_pos:
            pos = sem_pos[cid]
            sem_sim = sem_sims[pos] if pos < len(sem_sims) else 0.0
            sem_dist = sem_dists[pos] if pos < len(sem_dists) else None
            text = sem_docs[pos] if pos < len(sem_docs) else ""
            sem_meta = sem_metas[pos] if pos < len(sem_metas) else {}
        else:
            sem_sim, sem_dist, text, sem_meta = 0.0, None, "", {}

        bm25_sim = bm25_id_to_norm.get(cid, 0.0)
        if not text:
            text = bm25_id_to_text.get(cid, "")
        bm25_meta = bm25_id_to_meta.get(cid, {})
        meta = sem_meta if sem_meta else bm25_meta

        m_overlap = _meta_overlap(meta, q_terms)
        intent_boost = _intent_weight(meta, user_intent)
        act_wt = _action_weight(text, actions)

        final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost + epsilon * act_wt
        combined_records_ext.append(
            (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt)
        )

    # --- group by source document; rank the best document's chunks first ---
    from collections import defaultdict
    doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]]] = defaultdict(list)
    for rec in combined_records_ext:
        meta = rec[4] or {}
        doc_groups[meta.get("filename", "unknown")].append(rec)

    def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]]) -> float:
        # Document-level prior: aggregate chunk score plus positive boosts,
        # with a discounted penalty for negative intent/action signals.
        total_score = sum(r[1] for r in recs)
        total_overlap = sum(r[5] for r in recs)
        total_intent = sum(max(0.0, r[6]) for r in recs)
        total_action = sum(max(0.0, r[7]) for r in recs)
        total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
        return total_score + 0.4 * total_overlap + 0.6 * total_intent + 0.5 * total_action + 0.3 * total_penalty

    best_doc, best_doc_prior = None, -1.0
    for fn, recs in doc_groups.items():
        # NOTE(review): selection loop body reconstructed.
        prior = doc_prior(recs)
        if prior > best_doc_prior:
            best_doc, best_doc_prior = fn, prior

    best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
    other_recs = []
    for fn, recs in doc_groups.items():
        if fn == best_doc:
            continue
        other_recs.extend(recs)
    other_recs.sort(key=lambda x: x[1], reverse=True)

    reordered = best_recs + other_recs
    top = reordered[:top_k]

    documents = [t[3] for t in top]
    metadatas = [t[4] for t in top]
    distances = [t[2] for t in top]

    # NOTE(review): tail of the return dict reconstructed from the visible
    # trailing keys ("actions") and the values computed above.
    return {
        "documents": documents,
        "metadatas": metadatas,
        "distances": distances,
        "ids": [t[0] for t in top],
        "scores": [t[1] for t in top],
        "best_document": best_doc,
        "intent": user_intent,
        "actions": actions,
    }
463
+ # --------------------------- Section fetch helpers (for full output) ---------------------------
def get_section_text(filename: str, section: str) -> str:
    """Join the text of every indexed chunk belonging to *filename*/*section*.

    Scans the in-memory BM25 document list; chunks whose text is empty after
    stripping are skipped. Returns the matching chunks separated by blank
    lines, or an empty string when nothing matches.
    """
    def _matches(doc: Dict[str, Any]) -> bool:
        meta = doc.get("meta", {})
        return meta.get("filename") == filename and meta.get("section") == section

    parts = [(doc.get("text") or "").strip() for doc in bm25_docs if _matches(doc)]
    return "\n\n".join(p for p in parts if p).strip()
def get_best_steps_section_text(filename: str) -> str:
    """Return combined text of all 'steps'-tagged chunks of the given SOP.

    Same traversal as get_section_text, but filters on the ``intent_tag``
    metadata field instead of a section name.
    """
    collected: List[str] = []
    for doc in bm25_docs:
        meta = doc.get("meta", {})
        if meta.get("filename") != filename:
            continue
        if meta.get("intent_tag") != "steps":
            continue
        chunk = (doc.get("text") or "").strip()
        if chunk:
            collected.append(chunk)
    return "\n\n".join(collected).strip()
486
+ # --- Admin helpers (optional; unchanged) ---
487
  def get_kb_runtime_info() -> Dict[str, Any]:
488
  return {
489
  "chroma_path": CHROMA_PATH,
 
def reset_kb(folder_path: str) -> Dict[str, Any]:
    """Drop and recreate the knowledge base, then re-ingest *folder_path*.

    Deletes the Chroma collection (best-effort), rebinds the module-level
    ``collection``, removes the persisted BM25 index file, and re-ingests the
    documents from ``folder_path``. Non-fatal problems are collected under a
    ``warnings`` key; any unexpected failure yields a ``status: ERROR`` dict.
    """
    result = {"status": "OK", "message": "KB reset and re-ingested"}
    try:
        # Best-effort drop: the collection may not exist yet.
        try:
            client.delete_collection(name="knowledge_base")
        except Exception:
            pass
        global collection
        collection = client.get_or_create_collection(name="knowledge_base")
        # Remove the stale BM25 index; record (don't fail on) any problem.
        try:
            if os.path.isfile(BM25_INDEX_FILE):
                os.remove(BM25_INDEX_FILE)
        except Exception as e:
            result.setdefault("warnings", []).append(f"bm25 index delete: {e}")
        os.makedirs(CHROMA_PATH, exist_ok=True)
        # NOTE(review): the original re-ingestion lines were lost in a garbled
        # diff; reconstructed as a call to the module's ingestion entry point
        # plus a runtime-info snapshot — confirm the helper name against
        # repository history.
        create_knowledge_base(folder_path)
        result["info"] = get_kb_runtime_info()
        return result
    except Exception as e:
        return {"status": "ERROR", "error": f"{e}", "info": get_kb_runtime_info()}