Spaces:

ChatbotNova
/

Chatbot-Backend

Sleeping

App Files Files Community

srilakshu012456 committed on Dec 23, 2025

Commit

82c195c

verified ·

1 Parent(s): 4c40701

Update services/kb_creation.py

Browse files

Files changed (1) hide show

services/kb_creation.py +90 -159

services/kb_creation.py CHANGED Viewed

@@ -13,10 +13,7 @@ client = chromadb.PersistentClient(path=CHROMA_PATH)
 collection = client.get_or_create_collection(name="knowledge_base")
 # --------------------------- Embedding model ---------------------------
-# You can swap to a multilingual model if you expect mixed language queries:
-# model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
-# MODEL_PATH = './models/all-MiniLM-L6-v2'
-# model = SentenceTransformer(MODEL_PATH)
 model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 # --------------------------- BM25 (lightweight) ---------------------------
@@ -31,24 +28,14 @@ BM25_B = 0.75
 # --------------------------- Utilities ---------------------------
 def _tokenize(text: str) -> List[str]:
-    """
-    Simple tokenizer: lowercase alphanumeric words; removes most punctuation.
-    Keeps stopwords (BM25 can work with them), but normalizes whitespace.
-    """
     if not text:
         return []
     text = text.lower()
-    tokens = re.findall(r"[a-z0-9]+", text)
-    return tokens
 def _normalize_query(q: str) -> str:
-    """
-    Language-agnostic normalization for user queries (no hardcoded domain synonyms).
-    Removes filler verbs, collapses whitespace, lowercases, keeps key terms.
-    """
     q = (q or "").strip().lower()
     q = re.sub(r"[^\w\s]", " ", q)
-    # remove generic filler verbs/common noise words across English variants
     q = re.sub(
         r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b",
         " ",
@@ -58,17 +45,10 @@ def _normalize_query(q: str) -> str:
     return q
 def _tokenize_meta_value(val: Optional[str]) -> List[str]:
-    if not val:
-        return []
-    return _tokenize(val)
 # --------------------------- DOCX parsing & chunking ---------------------------
 def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
-    """
-    Split DOCX into (section_title, paragraphs_in_section).
-    Uses paragraph style names: 'Heading 1', 'Heading 2', etc.
-    Falls back to document-level when no headings are present.
-    """
     sections: List[Tuple[str, List[str]]] = []
     current_title = None
     current_paras: List[str] = []
@@ -77,7 +57,6 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
         style_name = (para.style.name if para.style else "") or ""
         is_heading = bool(re.match(r"Heading\s*\d+", style_name, flags=re.IGNORECASE))
         if is_heading and text:
-            # commit previous section
             if current_title or current_paras:
                 sections.append((current_title or "Untitled Section", current_paras))
             current_title = text
@@ -85,20 +64,14 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
         else:
             if text:
                 current_paras.append(text)
-    # final section
     if current_title or current_paras:
         sections.append((current_title or "Untitled Section", current_paras))
-    # in case no headings at all, make one pseudo-section with all text
     if not sections:
         all_text = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
         sections = [("Document", all_text)]
     return sections
 def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
-    """
-    Build chunks from paragraphs ONLY (no doc/section headers in the text).
-    We still keep title/section inside metadata so retrieval quality remains high.
-    """
     body = "\n".join(paragraphs).strip()
     if not body:
         return []
@@ -107,16 +80,13 @@ def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: Lis
     for i in range(0, len(words), max_words):
         chunk_body = ' '.join(words[i:i + max_words]).strip()
         if chunk_body:
-            chunks.append(chunk_body)  # <-- no headers inside the chunk content
     if not chunks:
         chunks = [body]
     return chunks
 # --------------------------- Intent tagging (auto) ---------------------------
 def _infer_intent_tag(section_title: str) -> str:
-    """
-    Infer coarse intent from section title—no manual curation.
-    """
     st = (section_title or "").lower()
     if any(k in st for k in ["process steps", "procedure", "how to", "workflow", "instructions"]):
         return "steps"
@@ -130,10 +100,6 @@ def _infer_intent_tag(section_title: str) -> str:
 # --------------------------- Ingestion ---------------------------
 def ingest_documents(folder_path: str) -> None:
-    """
-    Read .docx files, section-aware chunking, generate embeddings, store in ChromaDB,
-    and build BM25 inverted index with persistence.
-    """
     print(f"📂 Checking folder: {folder_path}")
     files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
     print(f"Found {len(files)} Word files: {files}")
@@ -141,13 +107,9 @@ def ingest_documents(folder_path: str) -> None:
         print("⚠️ No .docx files found. Please check the folder path.")
         return
-    # Reset BM25 memory structures
     global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
-    bm25_docs = []
-    bm25_inverted = {}
-    bm25_df = {}
-    bm25_avgdl = 0.0
-    bm25_ready = False
     for file in files:
         file_path = os.path.join(folder_path, file)
@@ -161,9 +123,8 @@ def ingest_documents(folder_path: str) -> None:
             total_chunks += len(chunks)
             intent_tag = _infer_intent_tag(section_title)
             for c_idx, chunk in enumerate(chunks):
-                # Embedding & Chroma
                 embedding = model.encode(chunk).tolist()
-                doc_id = f"{file}:{s_idx}:{c_idx}"  # stable unique id
                 meta = {
                     "filename": file,
                     "section": section_title,
@@ -173,49 +134,35 @@ def ingest_documents(folder_path: str) -> None:
                     "intent_tag": intent_tag,  # NEW
                 }
                 try:
-                    collection.add(
-                        ids=[doc_id],
-                        embeddings=[embedding],
-                        documents=[chunk],
-                        metadatas=[meta],
-                    )
                 except Exception:
-                    # upsert on duplicate
                     try:
                         collection.delete(ids=[doc_id])
-                        collection.add(
-                            ids=[doc_id],
-                            embeddings=[embedding],
-                            documents=[chunk],
-                            metadatas=[meta],
-                        )
                     except Exception as e2:
                         print(f"❌ Upsert failed for {doc_id}: {e2}")
-                # BM25 indexing
                 tokens = _tokenize(chunk)
                 tf: Dict[str, int] = {}
                 for t in tokens:
                     tf[t] = tf.get(t, 0) + 1
                 idx = len(bm25_docs)
                 bm25_docs.append({"id": doc_id, "text": chunk, "tokens": tokens, "tf": tf, "length": len(tokens), "meta": meta})
-                # update inverted index & df
-                seen_terms = set()
                 for term in tf.keys():
                     bm25_inverted.setdefault(term, []).append(idx)
-                    if term not in seen_terms:
                         bm25_df[term] = bm25_df.get(term, 0) + 1
-                        seen_terms.add(term)
         print(f"📄 Ingested {file} → {total_chunks} chunks")
-    # finalize BM25 stats
     N = len(bm25_docs)
     if N > 0:
         bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
         bm25_ready = True
-    # persist BM25 index
     payload = {
         "bm25_docs": bm25_docs,
         "bm25_inverted": bm25_inverted,
@@ -231,9 +178,6 @@ def ingest_documents(folder_path: str) -> None:
     print(f"✅ Documents ingested. Total entries in Chroma: {collection.count()}")
 def _load_bm25_index() -> None:
-    """
-    Load persisted BM25 index if available.
-    """
     global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
     if not os.path.exists(BM25_INDEX_FILE):
         return
@@ -250,14 +194,10 @@ def _load_bm25_index() -> None:
     except Exception as e:
         print(f"⚠️ Could not load BM25 index: {e}")
-# auto-load on import
 _load_bm25_index()
 # --------------------------- BM25 search ---------------------------
 def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
-    """
-    Okapi BM25 score for a given doc.
-    """
     if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
         return 0.0
     doc = bm25_docs[doc_idx]
@@ -270,7 +210,6 @@ def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
         tf = doc["tf"].get(term, 0)
         if tf == 0:
             continue
-        # BM25 idf
         N = len(bm25_docs)
         idf_ratio = ((N - df + 0.5) / (df + 0.5))
         try:
@@ -283,25 +222,18 @@ def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
     return score
 def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
-    """
-    Returns a list of (doc_idx, score) sorted by score desc.
-    """
     if not bm25_ready:
         return []
     norm = _normalize_query(query)
     q_terms = _tokenize(norm)
     if not q_terms:
         return []
-    # collect candidate doc indices via inverted index
     candidates = set()
     for t in q_terms:
         for idx in bm25_inverted.get(t, []):
             candidates.add(idx)
     if not candidates:
-        # fallback to brute force if no inverted match
         candidates = set(range(len(bm25_docs)))
     scored = []
     for idx in candidates:
         s = _bm25_score_for_doc(q_terms, idx)
@@ -310,32 +242,24 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
     scored.sort(key=lambda x: x[1], reverse=True)
     return scored[:top_k]
-# --------------------------- Semantic-only (legacy) ---------------------------
 def search_knowledge_base(query: str, top_k: int = 10) -> dict:
-    """
-    Semantic-only search (Chroma). We DO NOT ask for 'ids' in include
-    because some Chroma clients reject it; if 'ids' is present in the
-    response we will use it, otherwise we synthesize stable IDs from metadata.
-    """
     query_embedding = model.encode(query).tolist()
     res = collection.query(
         query_embeddings=[query_embedding],
         n_results=top_k,
-        include=['documents', 'metadatas', 'distances']  # ← no 'ids' here
     )
-    # Flatten lists-per-query
     docs_ll = res.get("documents", [[]]) or [[]]
     metas_ll = res.get("metadatas", [[]]) or [[]]
     dists_ll = res.get("distances", [[]]) or [[]]
-    ids_ll = res.get("ids", [[]]) or [[]]  # some clients still return 'ids' anyway
     documents = docs_ll[0] if docs_ll else []
     metadatas = metas_ll[0] if metas_ll else []
     distances = dists_ll[0] if dists_ll else []
     ids = ids_ll[0] if ids_ll else []
-    # If 'ids' is missing, synthesize stable IDs from metadata
     if not ids and documents:
         synthesized = []
         for i, m in enumerate(metadatas):
@@ -354,23 +278,14 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
         "ids": ids,
     }
-# --------------------------- Hybrid (BM25 + Embeddings) ---------------------------
-def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
-    """
-    Automatic metadata overlap score (no manual per-SOP lists).
-    Uses filename, title, and section tokens. Range ~0..1.
-    """
-    if not meta:
-        return 0.0
-    fn_tokens = _tokenize_meta_value(meta.get("filename"))
-    title_tokens = _tokenize_meta_value(meta.get("title"))
-    section_tokens = _tokenize_meta_value(meta.get("section"))
-    meta_tokens = set(fn_tokens + title_tokens + section_tokens)
-    if not meta_tokens or not q_terms:
-        return 0.0
-    qset = set(q_terms)
-    inter = len(meta_tokens & qset)
-    return inter / max(1, len(qset))
 def _detect_user_intent(query: str) -> str:
     q = (query or "").lower()
@@ -384,42 +299,72 @@ def _detect_user_intent(query: str) -> str:
         return "purpose"
     return "neutral"
 def _intent_weight(meta: dict, user_intent: str) -> float:
     tag = (meta or {}).get("intent_tag", "neutral")
     if user_intent == "neutral":
         return 0.0
     if tag == user_intent:
-        return 1.0     # strong boost when intent matches
     if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]:
-        return -0.6     # penalize overview/prereqs for steps/errors queries
-    return -0.2         # small penalty for other mismatches
 def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
     """
-    Hybrid retrieval:
-      - Semantic (Chroma/embeddings) → distances (lower = better) → convert to similarity
-      - BM25 keyword → score (higher = better)
-      - Re-rank union of candidates by:
-          final = alpha * semantic_sim + beta * bm25_norm + gamma * meta_overlap + delta * intent_boost
-      - Document-level voting prior: aggregate scores by 'filename' and prefer the best document first.
-    Returns a dict compatible with the extractor and includes:
-      - 'ids': list[str]
-      - 'combined_scores': list[float]
-      - 'best_doc', 'best_doc_prior', 'user_intent'
     """
-    # 1) Normalize query (language-agnostic)
     norm_query = _normalize_query(query)
     q_terms = _tokenize(norm_query)
     user_intent = _detect_user_intent(query)
-    # 2) Semantic candidates (Chroma)
     sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
     sem_docs = sem_res.get("documents", [])
     sem_metas = sem_res.get("metadatas", [])
     sem_dists = sem_res.get("distances", [])
     sem_ids = sem_res.get("ids", [])
-    # Convert distances to 0..1 similarity
     def dist_to_sim(d: Optional[float]) -> float:
         if d is None:
             return 0.0
@@ -430,32 +375,25 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
     sem_sims = [dist_to_sim(d) for d in sem_dists]
-    # 3) BM25 candidates
     bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
     bm25_max = max([s for _, s in bm25_hits], default=1.0)
     bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
-    # 4) Prepare BM25 maps
-    bm25_id_to_norm: Dict[str, float] = {}
-    bm25_id_to_text: Dict[str, str] = {}
-    bm25_id_to_meta: Dict[str, Dict[str, Any]] = {}
     for idx, nscore in bm25_norm_pairs:
         d = bm25_docs[idx]
         bm25_id_to_norm[d["id"]] = nscore
         bm25_id_to_text[d["id"]] = d["text"]
         bm25_id_to_meta[d["id"]] = d["meta"]
-    # 5) Union of candidates
     union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
-    gamma = 0.25  # metadata overlap weight
-    delta = 0.35  # intent-aware weight
-    combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float]] = []  # include overlap+intent
     for cid in union_ids:
-        # semantic part
         if cid in sem_ids:
             pos = sem_ids.index(cid)
             sem_sim = sem_sims[pos] if pos < len(sem_sims) else 0.0
@@ -465,52 +403,44 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
         else:
             sem_sim, sem_dist, sem_text, sem_meta = 0.0, None, "", {}
-        # bm25 part
         bm25_sim = bm25_id_to_norm.get(cid, 0.0)
         bm25_text = bm25_id_to_text.get(cid, "")
         bm25_meta = bm25_id_to_meta.get(cid, {})
-        # prefer non-empty text/meta
         text = sem_text if sem_text else bm25_text
         meta = sem_meta if sem_meta else bm25_meta
-        # NEW: automatic metadata overlap + intent-aware boost
         m_overlap = _meta_overlap(meta, q_terms)
         intent_boost = _intent_weight(meta, user_intent)
-        # final combined score
-        final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost
         combined_records_ext.append(
-            (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost)
         )
-    # ---------------- Document-level voting prior ----------------
     from collections import defaultdict
-    doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float]]] = defaultdict(list)
     for rec in combined_records_ext:
         meta = rec[4] or {}
         fn = meta.get("filename", "unknown")
         doc_groups[fn].append(rec)
-    # Compute doc_prior = sum(final_score) + bonuses for overlap+intent
-    def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float]]) -> float:
         total_score = sum(r[1] for r in recs)
         total_overlap = sum(r[5] for r in recs)
-        total_intent = sum(max(0.0, r[6]) for r in recs)  # positive intent boosts
-        total_penalty = sum(min(0.0, r[6]) for r in recs)  # penalties
-        return total_score + 0.4 * total_overlap + 0.6 * total_intent + 0.3 * total_penalty
-    # Pick best document
-    best_doc = None
-    best_doc_prior = -1.0
     for fn, recs in doc_groups.items():
         p = doc_prior(recs)
         if p > best_doc_prior:
-            best_doc_prior = p
-            best_doc = fn
-    # Reorder: take items from best_doc first (sorted by score), then others
     best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
     other_recs = []
     for fn, recs in doc_groups.items():
@@ -534,7 +464,8 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
         "distances": distances,
         "ids": ids,
         "combined_scores": combined_scores,
-        "best_doc": best_doc,              # helpful for debugging
-        "best_doc_prior": best_doc_prior,  # helpful for debugging
-        "user_intent": user_intent,        # helpful for debugging
     }

 collection = client.get_or_create_collection(name="knowledge_base")
 # --------------------------- Embedding model ---------------------------
+# model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # optional
 model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 # --------------------------- BM25 (lightweight) ---------------------------
 # --------------------------- Utilities ---------------------------
 def _tokenize(text: str) -> List[str]:
     if not text:
         return []
     text = text.lower()
+    return re.findall(r"[a-z0-9]+", text)
 def _normalize_query(q: str) -> str:
     q = (q or "").strip().lower()
     q = re.sub(r"[^\w\s]", " ", q)
     q = re.sub(
         r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b",
         " ",
     return q
 def _tokenize_meta_value(val: Optional[str]) -> List[str]:
     """Tokenize one metadata field with ``_tokenize``; ``None``/empty gives no tokens."""
     if not val:
         return []
     return _tokenize(val)
 # --------------------------- DOCX parsing & chunking ---------------------------
 def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
     sections: List[Tuple[str, List[str]]] = []
     current_title = None
     current_paras: List[str] = []
         style_name = (para.style.name if para.style else "") or ""
         is_heading = bool(re.match(r"Heading\s*\d+", style_name, flags=re.IGNORECASE))
         if is_heading and text:
             if current_title or current_paras:
                 sections.append((current_title or "Untitled Section", current_paras))
             current_title = text
         else:
             if text:
                 current_paras.append(text)
     if current_title or current_paras:
         sections.append((current_title or "Untitled Section", current_paras))
     if not sections:
         all_text = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
         sections = [("Document", all_text)]
     return sections
 def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
     body = "\n".join(paragraphs).strip()
     if not body:
         return []
     for i in range(0, len(words), max_words):
         chunk_body = ' '.join(words[i:i + max_words]).strip()
         if chunk_body:
+            chunks.append(chunk_body)  # no doc/section headers inside text
     if not chunks:
         chunks = [body]
     return chunks
 # --------------------------- Intent tagging (auto) ---------------------------
 def _infer_intent_tag(section_title: str) -> str:
     st = (section_title or "").lower()
     if any(k in st for k in ["process steps", "procedure", "how to", "workflow", "instructions"]):
         return "steps"
 # --------------------------- Ingestion ---------------------------
 def ingest_documents(folder_path: str) -> None:
     print(f"📂 Checking folder: {folder_path}")
     files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
     print(f"Found {len(files)} Word files: {files}")
         print("⚠️ No .docx files found. Please check the folder path.")
         return
     global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
+    bm25_docs, bm25_inverted, bm25_df = [], {}, {}
+    bm25_avgdl, bm25_ready = 0.0, False
     for file in files:
         file_path = os.path.join(folder_path, file)
             total_chunks += len(chunks)
             intent_tag = _infer_intent_tag(section_title)
             for c_idx, chunk in enumerate(chunks):
                 embedding = model.encode(chunk).tolist()
+                doc_id = f"{file}:{s_idx}:{c_idx}"
                 meta = {
                     "filename": file,
                     "section": section_title,
                     "intent_tag": intent_tag,  # NEW
                 }
                 try:
+                    collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
                 except Exception:
                     try:
                         collection.delete(ids=[doc_id])
+                        collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
                     except Exception as e2:
                         print(f"❌ Upsert failed for {doc_id}: {e2}")
                 tokens = _tokenize(chunk)
                 tf: Dict[str, int] = {}
                 for t in tokens:
                     tf[t] = tf.get(t, 0) + 1
                 idx = len(bm25_docs)
                 bm25_docs.append({"id": doc_id, "text": chunk, "tokens": tokens, "tf": tf, "length": len(tokens), "meta": meta})
+                seen = set()
                 for term in tf.keys():
                     bm25_inverted.setdefault(term, []).append(idx)
+                    if term not in seen:
                         bm25_df[term] = bm25_df.get(term, 0) + 1
+                        seen.add(term)
         print(f"📄 Ingested {file} → {total_chunks} chunks")
     N = len(bm25_docs)
     if N > 0:
         bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
         bm25_ready = True
     payload = {
         "bm25_docs": bm25_docs,
         "bm25_inverted": bm25_inverted,
     print(f"✅ Documents ingested. Total entries in Chroma: {collection.count()}")
 def _load_bm25_index() -> None:
     global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
     if not os.path.exists(BM25_INDEX_FILE):
         return
     except Exception as e:
         print(f"⚠️ Could not load BM25 index: {e}")
 _load_bm25_index()
 # --------------------------- BM25 search ---------------------------
 def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
     if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
         return 0.0
     doc = bm25_docs[doc_idx]
         tf = doc["tf"].get(term, 0)
         if tf == 0:
             continue
         N = len(bm25_docs)
         idf_ratio = ((N - df + 0.5) / (df + 0.5))
         try:
     return score
 def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
     if not bm25_ready:
         return []
     norm = _normalize_query(query)
     q_terms = _tokenize(norm)
     if not q_terms:
         return []
     candidates = set()
     for t in q_terms:
         for idx in bm25_inverted.get(t, []):
             candidates.add(idx)
     if not candidates:
         candidates = set(range(len(bm25_docs)))
     scored = []
     for idx in candidates:
         s = _bm25_score_for_doc(q_terms, idx)
     scored.sort(key=lambda x: x[1], reverse=True)
     return scored[:top_k]
+# --------------------------- Semantic-only ---------------------------
 def search_knowledge_base(query: str, top_k: int = 10) -> dict:
     query_embedding = model.encode(query).tolist()
     res = collection.query(
         query_embeddings=[query_embedding],
         n_results=top_k,
+        include=['documents', 'metadatas', 'distances']  # no 'ids' here
     )
     docs_ll = res.get("documents", [[]]) or [[]]
     metas_ll = res.get("metadatas", [[]]) or [[]]
     dists_ll = res.get("distances", [[]]) or [[]]
+    ids_ll = res.get("ids", [[]]) or [[]]
     documents = docs_ll[0] if docs_ll else []
     metadatas = metas_ll[0] if metas_ll else []
     distances = dists_ll[0] if dists_ll else []
     ids = ids_ll[0] if ids_ll else []
     if not ids and documents:
         synthesized = []
         for i, m in enumerate(metadatas):
         "ids": ids,
     }
+# --------------------------- Hybrid (BM25 + Embeddings + Intent + Action) ---------------------------
# Canonical action name -> words/phrases that signal that action in a query or chunk.
ACTION_SYNONYMS = {
    "create": [
        "create",
        "creation",
        "add",
        "new",
        "generate",
    ],
    "update": [
        "update",
        "modify",
        "change",
        "edit",
    ],
    "delete": [
        "delete",
        "remove",
    ],
    "navigate": [
        "navigate",
        "go to",
        "open",
    ],
    "perform": [
        "perform",
        "execute",
        "do",
    ],
}
 def _detect_user_intent(query: str) -> str:
     q = (query or "").lower()
         return "purpose"
     return "neutral"
+def _extract_actions(query: str) -> List[str]:
+    q = (query or "").lower()
+    found = []
+    for act, syns in ACTION_SYNONYMS.items():
+        if any(s in q for s in syns):
+            found.append(act)
+    return found or []
 def _intent_weight(meta: dict, user_intent: str) -> float:
     tag = (meta or {}).get("intent_tag", "neutral")
     if user_intent == "neutral":
         return 0.0
     if tag == user_intent:
+        return 1.0
     if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]:
+        return -0.6
+    return -0.2
+def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
+    fn_tokens = _tokenize_meta_value(meta.get("filename"))
+    title_tokens = _tokenize_meta_value(meta.get("title"))
+    section_tokens = _tokenize_meta_value(meta.get("section"))
+    meta_tokens = set(fn_tokens + title_tokens + section_tokens)
+    if not meta_tokens or not q_terms:
+        return 0.0
+    qset = set(q_terms)
+    inter = len(meta_tokens & qset)
+    return inter / max(1, len(qset))
+def _action_weight(text: str, actions: List[str]) -> float:
+    """
+    Boost if text contains target action verb(s); penalize if text dominated by other actions.
+    """
+    if not actions:
+        return 0.0
+    t = (text or "").lower()
+    score = 0.0
+    for act in actions:
+        for syn in ACTION_SYNONYMS.get(act, [act]):
+            if syn in t:
+                score += 1.0  # boost for each matching synonym
+    # Penalize conflicting actions: e.g., query 'create' but text has 'delete' heavily
+    conflicts = {"create": ["delete"], "delete": ["create"], "update": ["delete"], "navigate": [], "perform": []}
+    for act in actions:
+        for bad in conflicts.get(act, []):
+            for syn in ACTION_SYNONYMS.get(bad, [bad]):
+                if syn in t:
+                    score -= 0.8
+    return score
 def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
     """
+    final = alpha * semantic_sim + beta * bm25_norm + gamma * meta_overlap + delta * intent_boost + epsilon * action_weight
+    + document-level voting prior.
     """
     norm_query = _normalize_query(query)
     q_terms = _tokenize(norm_query)
     user_intent = _detect_user_intent(query)
+    actions = _extract_actions(query)  # NEW
     sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
     sem_docs = sem_res.get("documents", [])
     sem_metas = sem_res.get("metadatas", [])
     sem_dists = sem_res.get("distances", [])
     sem_ids = sem_res.get("ids", [])
     def dist_to_sim(d: Optional[float]) -> float:
         if d is None:
             return 0.0
     sem_sims = [dist_to_sim(d) for d in sem_dists]
     bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
     bm25_max = max([s for _, s in bm25_hits], default=1.0)
     bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
+    bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
     for idx, nscore in bm25_norm_pairs:
         d = bm25_docs[idx]
         bm25_id_to_norm[d["id"]] = nscore
         bm25_id_to_text[d["id"]] = d["text"]
         bm25_id_to_meta[d["id"]] = d["meta"]
     union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
+    gamma = 0.25  # meta overlap
+    delta = 0.35  # intent boost
+    epsilon = 0.30  # action weight
+    combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]] = []
     for cid in union_ids:
         if cid in sem_ids:
             pos = sem_ids.index(cid)
             sem_sim = sem_sims[pos] if pos < len(sem_sims) else 0.0
         else:
             sem_sim, sem_dist, sem_text, sem_meta = 0.0, None, "", {}
         bm25_sim = bm25_id_to_norm.get(cid, 0.0)
         bm25_text = bm25_id_to_text.get(cid, "")
         bm25_meta = bm25_id_to_meta.get(cid, {})
         text = sem_text if sem_text else bm25_text
         meta = sem_meta if sem_meta else bm25_meta
         m_overlap = _meta_overlap(meta, q_terms)
         intent_boost = _intent_weight(meta, user_intent)
+        act_wt = _action_weight(text, actions)  # NEW
+        final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost + epsilon * act_wt
         combined_records_ext.append(
+            (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt)
         )
     from collections import defaultdict
+    doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]]] = defaultdict(list)
     for rec in combined_records_ext:
         meta = rec[4] or {}
         fn = meta.get("filename", "unknown")
         doc_groups[fn].append(rec)
+    def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]]) -> float:
         total_score = sum(r[1] for r in recs)
         total_overlap = sum(r[5] for r in recs)
+        total_intent = sum(max(0.0, r[6]) for r in recs)
+        total_action = sum(max(0.0, r[7]) for r in recs)
+        total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
+        return total_score + 0.4 * total_overlap + 0.6 * total_intent + 0.5 * total_action + 0.3 * total_penalty
+    best_doc, best_doc_prior = None, -1.0
     for fn, recs in doc_groups.items():
         p = doc_prior(recs)
         if p > best_doc_prior:
+            best_doc_prior, best_doc = p, fn
     best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
     other_recs = []
     for fn, recs in doc_groups.items():
         "distances": distances,
         "ids": ids,
         "combined_scores": combined_scores,
+        "best_doc": best_doc,
+        "best_doc_prior": best_doc_prior,
+        "user_intent": user_intent,
+        "actions": actions,
     }