Update services/kb_creation.py

services/kb_creation.py  CHANGED  (+181 −401)
Old version (context plus removed lines, prefixed "-"; unrecoverable removed spans shown as "…"):

@@ -1,22 +1,31 @@
-#
 import os
 import re
 import pickle
 from typing import List, Dict, Any, Tuple, Optional
 from docx import Document
 from sentence_transformers import SentenceTransformer
 import chromadb

-# --------------------------
 CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
 client = chromadb.PersistentClient(path=CHROMA_PATH)
 collection = client.get_or_create_collection(name="knowledge_base")

-# --------------------------
 model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

-# --------------------------
 BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
 bm25_docs: List[Dict[str, Any]] = []
 bm25_inverted: Dict[str, List[int]] = {}
@@ -26,25 +35,51 @@ bm25_ready: bool = False
 BM25_K1 = 1.5
 BM25_B = 0.75

-# --------------------------
 def _tokenize(text: str) -> List[str]:
     if not text:
         return []
     text = text.lower()
     return re.findall(r"[a-z0-9]+", text)

 def _normalize_query(q: str) -> str:
     q = (q or "").strip().lower()
     q = re.sub(r"[^\w\s]", " ", q)
     q = re.sub(r"\s+", " ", q).strip()
     return q

-def _tokenize_meta_value(val: Optional[str]) -> List[str]:
-    return _tokenize(val or "")

 def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
     sections: List[Tuple[str, List[str]]] = []
     current_title = None
@@ -68,12 +103,8 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
         sections = [("Document", all_text)]
     return sections

 def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
-    """
-    Split paragraphs into bullet-aware lines:
-    - Preserve bullets/numbered list lines as separate atomic lines.
-    - Split long paragraphs by sentence boundaries.
-    """
     lines: List[str] = []
     for p in (paragraphs or []):
         p = (p or "").strip()
@@ -86,8 +117,8 @@ def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
         lines.extend(parts)
     return lines

-…
     lines = _paragraphs_to_lines(paragraphs)
     chunks: List[str] = []
     current: List[str] = []
@@ -113,83 +144,60 @@ def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: Lis
         chunks = [body]
     return chunks

-# --------------------------
-…
-    if any(k in st for k in SECTION_STEPS_HINTS):
-        return "steps"
-    if any(k in st for k in SECTION_ERRORS_HINTS):
-        return "errors"
-    if "pre" in st and "requisite" in st:
-        return "prereqs"
-    if any(k in st for k in ["purpose", "overview", "introduction"]):
-        return "purpose"
-    if any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
-        return "steps"
-    if any(k in st for k in ["appointment", "appointments", "schedule", "scheduling"]):
-        return "steps"
     return "neutral"

-def _derive_semantic_intent_from_text(text: str) -> Tuple[str, List[str]]:
-    t = (text or "").lower()
-    tags: List[str] = []
-    intent = "neutral"
-    if any(term in t for term in PERMISSION_TERMS):
-        intent = "errors"
-        tags.append("permissions")
-    if "role" in t:
-        tags.append("role_access")
-    if "security" in t:
-        tags.append("security")
-    if intent == "neutral" and any(term in t for term in ERROR_TERMS):
-        intent = "errors"
-        tags.append("errors")
-    if intent == "neutral" and any(v in t for v in STEP_VERBS):
-        intent = "steps"
-        tags.append("procedure")
-    return intent, list(set(tags))

 def _derive_module_tags(text: str, filename: str, section_title: str) -> List[str]:
     tokens = " ".join([filename or "", section_title or "", text or ""]).lower()
     found = []
-…
-    elif "receive" in tokens or "inbound" in tokens or "goods receipt" in tokens or "grn" in tokens:
-        found = ["receiving"]
-    elif "appointment" in tokens or "schedule" in tokens or "dock" in tokens:
-        found = ["appointments"]
     return list(sorted(set(found)))

 def ingest_documents(folder_path: str) -> None:
     print(f"[KB] Checking folder: {folder_path}")
     files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
@@ -208,20 +216,12 @@ def ingest_documents(folder_path: str) -> None:
         doc = Document(file_path)
         sections = _split_by_sections(doc)
         total_chunks = 0
         for s_idx, (section_title, paras) in enumerate(sections):
-            chunks = _chunk_text_with_context(
             total_chunks += len(chunks)
-            base_intent = _infer_intent_tag(section_title)
             for c_idx, chunk in enumerate(chunks):
-…
-                if derived_intent == "errors":
-                    final_intent = "errors"
-                elif base_intent == "neutral" and derived_intent in ("steps", "prereqs"):
-                    final_intent = derived_intent
                 module_tags = _derive_module_tags(chunk, file, section_title)
                 embedding = model.encode(chunk).tolist()
                 doc_id = f"{file}:{s_idx}:{c_idx}"
@@ -231,11 +231,10 @@
                     "chunk_index": c_idx,
                     "title": doc_title,
                     "collection": "SOP",
-                    "intent_tag":
-                    "
                     "module_tags": ", ".join(module_tags) if module_tags else "",
                 }
                 try:
                     collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
                 except Exception:
@@ -244,12 +243,10 @@
                     collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
                 except Exception as e2:
                     print(f"[KB] ERROR: Upsert failed for {doc_id}: {e2}")
                 tokens = _tokenize(chunk)
                 tf: Dict[str, int] = {}
                 for tkn in tokens:
                     tf[tkn] = tf.get(tkn, 0) + 1
                 idx = len(bm25_docs)
                 bm25_docs.append({
                     "id": doc_id,
@@ -259,14 +256,12 @@
                     "length": len(tokens),
                     "meta": meta,
                 })
                 seen = set()
                 for term in tf.keys():
                     bm25_inverted.setdefault(term, []).append(idx)
                     if term not in seen:
                         bm25_df[term] = bm25_df.get(term, 0) + 1
                         seen.add(term)
         print(f"[KB] Ingested {file} → {total_chunks} chunks")

     N = len(bm25_docs)
@@ -287,7 +282,7 @@
     print(f"[KB] BM25 index saved: {BM25_INDEX_FILE}")
     print(f"[KB] Documents ingested. Total entries in Chroma: {collection.count()}")

-# --------------------------
 def _load_bm25_index() -> None:
     global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
     if not os.path.exists(BM25_INDEX_FILE):
@@ -307,7 +302,7 @@ def _load_bm25_index() -> None:

 _load_bm25_index()

 def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
     if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
         return 0.0
@@ -322,31 +317,28 @@ def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
         if tf == 0:
             continue
         N = len(bm25_docs)
-        idf_ratio = ((N - df + 0.5) / (df + 0.5))
         try:
             import math
-            idf = math.log(
         except Exception:
             idf = 1.0
         denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
         score += idf * (((tf * (BM25_K1 + 1)) / (denom or 1.0)))
     return score

 def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
     if not bm25_ready:
         return []
-    q_terms = _tokenize(norm)
     if not q_terms:
         return []
     candidates = set()
     for t in q_terms:
         for idx in bm25_inverted.get(t, []):
             candidates.add(idx)
     if not candidates:
         candidates = set(range(len(bm25_docs)))
     scored = []
     for idx in candidates:
         s = _bm25_score_for_doc(q_terms, idx)
@@ -355,19 +347,17 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
     scored.sort(key=lambda x: x[1], reverse=True)
     return scored[:top_k]

-# --------------------------
 def search_knowledge_base(query: str, top_k: int = 10) -> dict:
     query_embedding = model.encode(query).tolist()
     res = collection.query(
         query_embeddings=[query_embedding],
         n_results=top_k,
-        include=['documents', 'metadatas', 'distances']
     )
     documents = (res.get("documents", [[]]) or [[]])[0]
     metadatas = (res.get("metadatas", [[]]) or [[]])[0]
     distances = (res.get("distances", [[]]) or [[]])[0]
-
-    # Synthesize IDs from metadata (filename:section:chunk_index)
     ids: List[str] = []
     if documents:
         synthesized = []
@@ -377,8 +367,6 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
             idx = (m or {}).get("chunk_index", i)
             synthesized.append(f"{fn}:{sec}:{idx}")
         ids = synthesized
-
-    print(f"[KB] search → {len(documents)} docs (top_k={top_k}); first distance: {distances[0] if distances else 'n/a'}; ids synthesized={len(ids)}")
     return {
         "documents": documents,
         "metadatas": metadatas,
@@ -386,158 +374,47 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
         "ids": ids,
     }

-# --------------------------
-…
-    "create": ["create", "creation", "add", "new", "generate", "book", "schedule", "set up"],
-    "update": ["update", "modify", "change", "edit", "reschedule", "adjust", "move"],
-    "delete": ["delete", "remove"],
-    "navigate": ["navigate", "go to", "open"],
-}
-
-ERROR_INTENT_TERMS = [
-    "error", "issue", "fail", "not working", "resolution", "fix",
-    "permission", "permissions", "access", "no access", "authorization", "authorisation",
-    "role", "role mapping", "not authorized", "permission denied", "insufficient privileges",
-    "escalation", "escalation path", "access right", "mismatch", "locked", "wrong"
-]
-
-def _detect_user_intent(query: str) -> str:
-    q = (query or "").lower()
-    if any(k in q for k in ERROR_INTENT_TERMS):
-        return "errors"
-    if any(k in q for k in ["steps", "procedure", "how to", "navigate", "process", "do", "perform", "receiving"]):
-        return "steps"
-    if any(k in q for k in ["pre-requisite", "prerequisites", "requirement", "requirements"]):
-        return "prereqs"
-    if any(k in q for k in ["purpose", "overview", "introduction"]):
-        return "purpose"
-    return "neutral"
-
-def _extract_actions(query: str) -> List[str]:
     q = (query or "").lower()
     found = []
     for act, syns in ACTION_SYNONYMS.items():
         if any(s in q for s in syns):
             found.append(act)
-    if
-        found.append("
-    return list(sorted(set(found))) or []
-
-def _extract_modules_from_query(query: str) -> List[str]:
-    q = (query or "").lower()
-    found = []
-    for mod, syns in MODULE_VOCAB.items():
-        if any(s in q for s in syns):
-            found.append(mod)
-    if not found and any(w in q for w in ["receive", "receiving", "grn", "goods receipt", "inbound"]):
-        found = ["receiving"]
-    if "receiving" in found and "appointments" in found:
-        return ["receiving"]
     return list(sorted(set(found)))

-def _action_weight(text: str, actions: List[str]) -> float:
-    if not actions:
-        return 0.0
-    t = (text or "").lower()
-    score = 0.0
-    for act in actions:
-        for syn in ACTION_SYNONYMS.get(act, [act]):
-            if syn in t:
-                score += 1.0
-    # conflict matrix: penalize mismatched operations (e.g., user wants update but chunk talks about create)
-    conflicts = {"create": ["delete"], "delete": ["create"], "update": ["create", "delete"], "navigate": []}
-    for act in actions:
-        for bad in conflicts.get(act, []):
-            for syn in ACTION_SYNONYMS.get(bad, [bad]):
-                if syn in t:
-                    score -= 0.8
-    return score

-def
-…
-def _intent_weight(meta: dict, user_intent: str) -> float:
-    tag = (meta or {}).get("intent_tag", "neutral")
-    if user_intent == "neutral":
-        return 0.0
-    if tag == user_intent:
-        return 1.0
-    if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]:
-        return -0.6
-    st = ((meta or {}).get("section", "") or "").lower()
-    topics = (meta or {}).get("topic_tags", "") or ""
-    topic_list = [t.strip() for t in topics.split(",") if t.strip()]
-    # Prefer errors sections strongly
-    if user_intent == "errors" and (
-        any(k in st for k in ["common errors", "known issues", "common issues", "errors", "escalation", "permissions", "access"])
-        or ("permissions" in topic_list)
-    ):
-        return 1.10
-    if user_intent == "steps" and any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
-        return 0.75
-    return -0.2

 def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
-…
-    meta_tokens = set(fn_tokens + title_tokens + section_tokens + topic_tokens + module_tokens)
     if not meta_tokens or not q_terms:
         return 0.0
-    inter
-    return inter / max(1, len(qset))

-def _make_ngrams(tokens: List[str], n: int) -> List[str]:
-    return [" ".join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]

-def
-…
-    for bg in bigrams:
-        if bg and bg in low:
-            score += 0.40
-    for tg in trigrams:
-        if tg and tg in low:
-            score += 0.70
-    return min(score, 2.0)
-
-def _literal_query_match_boost(text: str, query_norm: str) -> float:
-    """Extra boost if exact normalized query substring or bigrams appear."""
-    t = (text or "").lower()
-    q = (query_norm or "").lower()
-    boost = 0.0
-    if q and q in t:
-        boost += 0.8
-    toks = [tok for tok in q.split() if len(tok) > 2]
-    bigrams = _make_ngrams(toks, 2)
-    for bg in bigrams:
-        if bg in t:
-            boost += 0.8
-            break
-    return min(boost, 1.6)
-
-def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
-    norm_query = _normalize_query(query)
-    q_terms = _tokenize(norm_query)
-    user_intent = _detect_user_intent(query)
-    actions = _extract_actions(query)
-    user_modules = _extract_modules_from_query(query)
-
-    sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 40))
     sem_docs = sem_res.get("documents", [])
     sem_metas = sem_res.get("metadatas", [])
     sem_dists = sem_res.get("distances", [])
@@ -553,9 +430,10 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6

     sem_sims = [dist_to_sim(d) for d in sem_dists]

-    bm25_hits = bm25_search(
     bm25_max = max([s for _, s in bm25_hits], default=1.0)
     bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
     bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
     for idx, nscore in bm25_norm_pairs:
         d = bm25_docs[idx]
@@ -565,15 +443,7 @@

     union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())

-    delta = 0.55    # intent boost (stronger)
-    epsilon = 0.30  # action weight
-    zeta = 0.65     # module weight
-    eta = 0.50      # phrase-level boost (stronger)
-    theta = 0.40    # heading alignment bonus
-    iota = 0.60     # literal query match boost (stronger)
-
-    combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]] = []
     for cid in union_ids:
         if cid in sem_ids:
             pos = sem_ids.index(cid)
@@ -591,128 +461,48 @@
         text = sem_text if sem_text else bm25_text
         meta = sem_meta if sem_meta else bm25_meta

-…
-        final_score = (
-            alpha * sem_sim
-            + beta * bm25_sim
-            + gamma * m_overlap
-            + delta * intent_boost
-            + epsilon * act_wt
-            + zeta * mod_wt
-            + eta * phrase_wt
-            + theta * heading_bonus
-            + iota * literal_wt
-        )
-
-        combined_records_ext.append(
-            (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt, phrase_wt, heading_bonus, literal_wt)
-        )
-
-    # ---- Exact-match rerank for errors ----
-    if user_intent == "errors":
-        exact_hits = []
-        for rec in combined_records_ext:
-            text_lower = (rec[3] or "").lower()
-            if any(phrase in text_lower for phrase in [
-                norm_query,
-                *(_make_ngrams([tok for tok in norm_query.split() if len(tok) > 2], 2))
-            ]):
-                exact_hits.append(rec)
-        if exact_hits:
-            # Move exact hits to front and keep order by current final_score
-            rest = [r for r in combined_records_ext if r not in exact_hits]
-            exact_hits.sort(key=lambda x: x[1], reverse=True)
-            rest.sort(key=lambda x: x[1], reverse=True)
-            combined_records_ext = exact_hits + rest
-
-    from collections import defaultdict
-    doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]] = defaultdict(list)
-    for rec in combined_records_ext:
-        meta = rec[4] or {}
-        fn = meta.get("filename", "unknown")
-        doc_groups[fn].append(rec)
-
-    def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]) -> float:
-        total_score = sum(r[1] for r in recs)
-        total_overlap = sum(r[5] for r in recs)
-        total_intent = sum(max(0.0, r[6]) for r in recs)
-        total_action = sum(max(0.0, r[7]) for r in recs)
-        total_module = sum(r[8] for r in recs)
-        total_phrase = sum(r[9] for r in recs)
-        total_heading = sum(r[10] for r in recs)
-        total_literal = sum(r[11] for r in recs)
-        total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
-
-        # Errors doc prior: bonus for errors/known issues sections
-        errors_section_bonus = 0.0
-        if any("error" in ((r[4] or {}).get("section", "")).lower() or "known issues" in ((r[4] or {}).get("section", "")).lower()
-               or "common issues" in ((r[4] or {}).get("section", "")).lower() for r in recs):
-            errors_section_bonus = 0.5
-
-        return (
-            total_score
-            + 0.4 * total_overlap
-            + 0.7 * total_intent
-            + 0.5 * total_action
-            + 0.8 * total_module
-            + 0.6 * total_phrase
-            + 0.6 * total_heading
-            + 0.7 * total_literal
-            + errors_section_bonus
-            + 0.3 * total_penalty
-        )
-
-    best_doc, best_doc_prior = None, -1.0
-    for fn, recs in doc_groups.items():
-        p = doc_prior(recs)
-        if p > best_doc_prior:
-            best_doc_prior, best_doc = p, fn
-
-    best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
-    other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]] = []
-    for fn, recs in doc_groups.items():
-        if fn == best_doc:
-            continue
-        other_recs.extend(recs)
-    other_recs.sort(key=lambda x: x[1], reverse=True)

-…

     return {
-        "documents":
-        "metadatas":
-        "distances":
-        "ids":
-        "combined_scores":
-        "best_doc":
-        "
-        "
-        "actions": actions,
     }

-# --------------------------
 def get_section_text(filename: str, section: str) -> str:
     texts: List[str] = []
     for d in bm25_docs:
@@ -721,7 +511,8 @@ def get_section_text(filename: str, section: str) -> str:
             t = (d.get("text") or "").strip()
             if t:
                 texts.append(t)
-    return "

 def get_best_steps_section_text(filename: str) -> str:
     texts: List[str] = []
@@ -731,32 +522,22 @@ def get_best_steps_section_text(filename: str) -> str:
             t = (d.get("text") or "").strip()
             if t:
                 texts.append(t)
-    return "

 def get_best_errors_section_text(filename: str) -> str:
     texts: List[str] = []
     for d in bm25_docs:
         m = d.get("meta", {})
         sec = (m.get("section") or "").lower()
-        topics = (m.get("
-
-        if m.get("filename") == filename and (
-            m.get("intent_tag") == "errors"
-            or "error" in sec
-            or "escalation" in sec
-            or "permission" in sec
-            or "access" in sec
-            or "known issues" in sec
-            or "common issues" in sec
-            or "errors" in sec
-            or ("permissions" in topic_list)
-        ):
             t = (d.get("text") or "").strip()
             if t:
                 texts.append(t)
-    return "

-# --------------------------
 def get_kb_runtime_info() -> Dict[str, Any]:
     return {
         "chroma_path": CHROMA_PATH,
@@ -767,6 +548,7 @@ def get_kb_runtime_info() -> Dict[str, Any]:
         "bm25_ready": bm25_ready,
     }

 def reset_kb(folder_path: str) -> Dict[str, Any]:
     result = {"status": "OK", "message": "KB reset and re-ingested"}
     try:
@@ -776,13 +558,11 @@ def reset_kb(folder_path: str) -> Dict[str, Any]:
         pass
     global collection
     collection = client.get_or_create_collection(name="knowledge_base")
     try:
         if os.path.isfile(BM25_INDEX_FILE):
             os.remove(BM25_INDEX_FILE)
     except Exception as e:
         result.setdefault("warnings", []).append(f"bm25 index delete: {e}")
     os.makedirs(CHROMA_PATH, exist_ok=True)
     ingest_documents(folder_path)
     result["info"] = get_kb_runtime_info()
New version (context plus added lines, prefixed "+"; elided unchanged regions shown as "…"):

+# kb_creation.py (single file)
+# ---------------------------------------------------------------
+# Action-aware KB ingestion + hybrid search for SOP documents.
+# Tags each chunk with intent (steps/errors), module (appointments,
+# receiving, etc.), and action (create/update/delete). Hybrid ranking
+# rewards action alignment and penalizes conflicts so "update
+# appointment" returns update/reschedule steps—NOT creation.
+# ---------------------------------------------------------------
+
 import os
 import re
 import pickle
 from typing import List, Dict, Any, Tuple, Optional
+
 from docx import Document
 from sentence_transformers import SentenceTransformer
 import chromadb

+# -------------------------- ChromaDB setup --------------------------
 CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
 client = chromadb.PersistentClient(path=CHROMA_PATH)
 collection = client.get_or_create_collection(name="knowledge_base")

+# -------------------------- Embedding model -------------------------
 model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

+# -------------------------- BM25 (lightweight) ----------------------
 BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
 bm25_docs: List[Dict[str, Any]] = []
 bm25_inverted: Dict[str, List[int]] = {}
…
 BM25_K1 = 1.5
 BM25_B = 0.75

+# -------------------------- Vocab & Heuristics ----------------------
+APPT_WORDS = ["appointment", "appointments", "schedule", "scheduling", "dock door", "slot"]
+CREATE_WORDS = ["create", "creation", "new", "add", "generate"]
+UPDATE_WORDS = ["update", "modify", "change", "edit", "reschedule", "re-schedule", "revise"]
+DELETE_WORDS = ["delete", "remove", "cancel", "void"]
+
+ACTION_SYNONYMS = {
+    "create": CREATE_WORDS,
+    "update": UPDATE_WORDS,
+    "delete": DELETE_WORDS,
+    "navigate": ["navigate", "go to", "open"],
+}
+
+ACTION_CONFLICTS = {
+    "update": ["create", "delete"],
+    "create": ["update", "delete"],
+    "delete": ["create", "update"],
+}
+
+SECTION_STEPS_HINTS = ["process steps", "procedure", "how to", "workflow", "instructions", "steps"]
+SECTION_ERRORS_HINTS = ["common errors", "resolution", "troubleshooting", "known issues", "common issues", "escalation", "escalation path", "permissions", "access"]
+ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't", "mismatch", "locked", "wrong", "denied"]
+
+BULLET_RE = re.compile(r"^\s*(?:[\-\*•]|\d+[\.)])\s+", re.IGNORECASE)
+
+# -------------------------- Utils ----------------------------------
 def _tokenize(text: str) -> List[str]:
     if not text:
         return []
     text = text.lower()
     return re.findall(r"[a-z0-9]+", text)

 def _normalize_query(q: str) -> str:
     q = (q or "").strip().lower()
     q = re.sub(r"[^\w\s]", " ", q)
     q = re.sub(r"\s+", " ", q).strip()
     return q

+def _contains_any(text: str, words: List[str]) -> bool:
+    low = (text or "").lower()
+    return any(w in low for w in words)

+# -------------------------- DOCX parsing ----------------------------
 def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
     sections: List[Tuple[str, List[str]]] = []
     current_title = None
…
         sections = [("Document", all_text)]
     return sections

 def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
     lines: List[str] = []
     for p in (paragraphs or []):
         p = (p or "").strip()
…
         lines.extend(parts)
     return lines

+def _chunk_text_with_context(paragraphs: List[str], max_words: int = 140) -> List[str]:
     lines = _paragraphs_to_lines(paragraphs)
     chunks: List[str] = []
     current: List[str] = []
…
         chunks = [body]
     return chunks

+# -------------------------- Tagging ---------------------------------
+def _nearest_action_to_subject(text: str, subject_words: List[str]) -> Optional[str]:
+    """Pick action based on proximity to subject tokens (e.g., appointment)."""
+    low = (text or "").lower()
+    best = None
+    best_pos = 10**9
+    for subj in subject_words:
+        for m in re.finditer(re.escape(subj), low):
+            pos = m.start()
+            window = low[max(0, pos-80): pos+120]
+            for act, syns in [("update", UPDATE_WORDS), ("create", CREATE_WORDS), ("delete", DELETE_WORDS)]:
+                if any(s in window for s in syns):
+                    if pos < best_pos:
+                        best, best_pos = act, pos
+    return best

+def _classify_action(text: str, filename: str, section: str) -> str:
+    tokens = " ".join([filename or "", section or "", text or ""]).lower()
+    prox = _nearest_action_to_subject(tokens, APPT_WORDS)
+    if prox:
+        return prox
+    if _contains_any(tokens, UPDATE_WORDS):
+        return "update"
+    if _contains_any(tokens, CREATE_WORDS):
+        return "create"
+    if _contains_any(tokens, DELETE_WORDS):
+        return "delete"
     return "neutral"

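A quick sanity check of the proximity rule above (the sample text and filename are made up; illustrative only):

    # "reschedule" falls inside the -80/+120 character window around the first
    # "appointment" hit, so the proximity rule decides before the bag-of-words
    # fallbacks and the chunk is tagged "update".
    sample = "To reschedule an existing appointment, open the dock schedule view."
    assert _classify_action(sample, "appointments_sop.docx", "Appointments") == "update"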
 def _derive_module_tags(text: str, filename: str, section_title: str) -> List[str]:
     tokens = " ".join([filename or "", section_title or "", text or ""]).lower()
     found = []
+    if any(w in tokens for w in APPT_WORDS):
+        found.append("appointments")
+    if any(w in tokens for w in ["receive", "receiving", "inbound", "goods receipt", "grn"]):
+        found.append("receiving")
+    if not found and ("dock" in tokens or "door" in tokens):
+        found.append("appointments")
     return list(sorted(set(found)))

+def _infer_intent_tag(section_title: str, text: str) -> str:
+    st = (section_title or "").lower()
+    if any(k in st for k in SECTION_STEPS_HINTS):
+        return "steps"
+    if any(k in st for k in SECTION_ERRORS_HINTS):
+        return "errors"
+    if any(t in (text or "").lower() for t in ERROR_TERMS):
+        return "errors"
+    return "steps"

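How the two taggers combine on a hypothetical chunk (heading and body invented for illustration):

    heading = "Common Errors and Escalation"
    body = "Appointment slot mismatch: the dock door schedule is locked."
    print(_infer_intent_tag(heading, body))                          # "errors" (heading matches SECTION_ERRORS_HINTS)
    print(_derive_module_tags(body, "receiving_sop.docx", heading))  # ['appointments', 'receiving']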
+# -------------------------- Ingestion -------------------------------
 def ingest_documents(folder_path: str) -> None:
     print(f"[KB] Checking folder: {folder_path}")
     files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
…
         doc = Document(file_path)
         sections = _split_by_sections(doc)
         total_chunks = 0
         for s_idx, (section_title, paras) in enumerate(sections):
+            chunks = _chunk_text_with_context(paras, max_words=140)
             total_chunks += len(chunks)
             for c_idx, chunk in enumerate(chunks):
+                action_tag = _classify_action(chunk, file, section_title)
+                intent_tag = _infer_intent_tag(section_title, chunk)
                 module_tags = _derive_module_tags(chunk, file, section_title)
                 embedding = model.encode(chunk).tolist()
                 doc_id = f"{file}:{s_idx}:{c_idx}"
…
                     "chunk_index": c_idx,
                     "title": doc_title,
                     "collection": "SOP",
+                    "intent_tag": intent_tag,
+                    "action_tag": action_tag,
                     "module_tags": ", ".join(module_tags) if module_tags else "",
                 }
                 try:
                     collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
                 except Exception:
…
                     collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
                 except Exception as e2:
                     print(f"[KB] ERROR: Upsert failed for {doc_id}: {e2}")
                 tokens = _tokenize(chunk)
                 tf: Dict[str, int] = {}
                 for tkn in tokens:
                     tf[tkn] = tf.get(tkn, 0) + 1
                 idx = len(bm25_docs)
                 bm25_docs.append({
                     "id": doc_id,
…
                     "length": len(tokens),
                     "meta": meta,
                 })
                 seen = set()
                 for term in tf.keys():
                     bm25_inverted.setdefault(term, []).append(idx)
                     if term not in seen:
                         bm25_df[term] = bm25_df.get(term, 0) + 1
                         seen.add(term)
         print(f"[KB] Ingested {file} → {total_chunks} chunks")

     N = len(bm25_docs)
…
     print(f"[KB] BM25 index saved: {BM25_INDEX_FILE}")
     print(f"[KB] Documents ingested. Total entries in Chroma: {collection.count()}")

+# -------------------------- BM25 load/search ------------------------
 def _load_bm25_index() -> None:
     global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
     if not os.path.exists(BM25_INDEX_FILE):
…

 _load_bm25_index()

 def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
     if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
         return 0.0
…
         if tf == 0:
             continue
         N = len(bm25_docs)
         try:
             import math
+            idf = math.log(((N - df + 0.5) / (df + 0.5)) + 1.0)
         except Exception:
             idf = 1.0
         denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
         score += idf * (((tf * (BM25_K1 + 1)) / (denom or 1.0)))
     return score

 def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
     if not bm25_ready:
         return []
+    q_terms = _tokenize(_normalize_query(query))
     if not q_terms:
         return []
     candidates = set()
     for t in q_terms:
         for idx in bm25_inverted.get(t, []):
             candidates.add(idx)
     if not candidates:
         candidates = set(range(len(bm25_docs)))
     scored = []
     for idx in candidates:
         s = _bm25_score_for_doc(q_terms, idx)
…
     scored.sort(key=lambda x: x[1], reverse=True)
     return scored[:top_k]

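The restored idf line is standard Okapi BM25 with +1 smoothing inside the log, which keeps term weights non-negative even for very common terms. A self-contained sketch of the same arithmetic (all numbers invented for illustration):

    import math

    def bm25_term_score(tf, df, N, dl, avgdl, k1=1.5, b=0.75):
        # idf with the +1 smoothing used above; the log argument is always >= 1
        idf = math.log(((N - df + 0.5) / (df + 0.5)) + 1.0)
        # length normalization: longer-than-average documents are damped by b
        denom = tf + k1 * (1 - b + b * (dl / avgdl))
        return idf * (tf * (k1 + 1)) / denom

    # e.g. a term appearing twice in an average-length chunk, present in 3 of 100 chunks
    print(bm25_term_score(tf=2, df=3, N=100, dl=120, avgdl=120))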
+# -------------------------- Semantic search -------------------------
 def search_knowledge_base(query: str, top_k: int = 10) -> dict:
     query_embedding = model.encode(query).tolist()
     res = collection.query(
         query_embeddings=[query_embedding],
         n_results=top_k,
+        include=['documents', 'metadatas', 'distances']
     )
     documents = (res.get("documents", [[]]) or [[]])[0]
     metadatas = (res.get("metadatas", [[]]) or [[]])[0]
     distances = (res.get("distances", [[]]) or [[]])[0]
     ids: List[str] = []
     if documents:
         synthesized = []
…
             idx = (m or {}).get("chunk_index", i)
             synthesized.append(f"{fn}:{sec}:{idx}")
         ids = synthesized
     return {
         "documents": documents,
         "metadatas": metadatas,
…
         "ids": ids,
     }

+# -------------------------- Hybrid ranking --------------------------
+def _detect_user_action(query: str) -> List[str]:
     q = (query or "").lower()
     found = []
     for act, syns in ACTION_SYNONYMS.items():
         if any(s in q for s in syns):
             found.append(act)
+    if "reschedule" in q or "re-schedule" in q:
+        found.append("update")
     return list(sorted(set(found)))

+def _detect_user_modules(query: str) -> List[str]:
+    q = (query or "").lower()
+    mods = []
+    if any(w in q for w in APPT_WORDS):
+        mods.append("appointments")
+    if any(w in q for w in ["receive", "receiving", "inbound", "goods receipt", "grn"]):
+        mods.append("receiving")
+    return list(sorted(set(mods)))

 def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
+    fn = _tokenize((meta or {}).get("filename", ""))
+    sec = _tokenize((meta or {}).get("section", ""))
+    title = _tokenize((meta or {}).get("title", ""))
+    mods = _tokenize((meta or {}).get("module_tags", ""))
+    meta_tokens = set(fn + sec + title + mods)
     if not meta_tokens or not q_terms:
         return 0.0
+    inter = len(meta_tokens & set(q_terms))
+    return inter / max(1, len(q_terms))

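A tiny check of the overlap ratio (metadata values hypothetical):

    meta = {"filename": "appointments_sop.docx", "section": "Update Appointment",
            "title": "Appointments SOP", "module_tags": "appointments"}
    # 3 of the 4 query tokens appear in the metadata token set -> 0.75
    print(_meta_overlap(meta, ["update", "dock", "appointment", "sop"]))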
+def hybrid_search_knowledge_base(query: str, top_k: int = 10) -> dict:
+    norm_q = _normalize_query(query)
+    q_terms = _tokenize(norm_q)
+    user_actions = _detect_user_action(query)
+    user_modules = _detect_user_modules(query)
+
+    sem_res = search_knowledge_base(norm_q, top_k=max(top_k, 40))
     sem_docs = sem_res.get("documents", [])
     sem_metas = sem_res.get("metadatas", [])
     sem_dists = sem_res.get("distances", [])
…

     sem_sims = [dist_to_sim(d) for d in sem_dists]

+    bm25_hits = bm25_search(norm_q, top_k=max(80, top_k * 6))
     bm25_max = max([s for _, s in bm25_hits], default=1.0)
     bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
+
     bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
     for idx, nscore in bm25_norm_pairs:
         d = bm25_docs[idx]
…

     union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())

+    records: List[Tuple[str, float, float, str, Dict[str, Any]]] = []
     for cid in union_ids:
         if cid in sem_ids:
             pos = sem_ids.index(cid)
…
         text = sem_text if sem_text else bm25_text
         meta = sem_meta if sem_meta else bm25_meta

+        base = 0.55 * sem_sim + 0.45 * bm25_sim
+        overlap = 0.30 * _meta_overlap(meta, q_terms)
+
+        doc_mods = [m.strip() for m in (meta.get("module_tags") or "").split(",") if m.strip()]
+        mod_overlap = len(set(doc_mods) & set(user_modules))
+        mod_bonus = 0.60 * mod_overlap if mod_overlap else -0.50
+
+        doc_action = (meta.get("action_tag") or "neutral").lower()
+        action_bonus = 0.0
+        if user_actions:
+            if doc_action in user_actions:
+                action_bonus += 1.40
+            for ua in user_actions:
+                for bad in ACTION_CONFLICTS.get(ua, []):
+                    if doc_action == bad:
+                        action_bonus -= 1.40

+        sec_low = (meta.get("section") or "").lower()
+        title_low = (meta.get("title") or "").lower()
+        head_bonus = 0.0
+        if any(w in sec_low for w in APPT_WORDS) or any(w in title_low for w in APPT_WORDS):
+            if "appointments" in user_modules:
+                head_bonus += 0.40
+
+        final = base + overlap + mod_bonus + action_bonus + head_bonus
+        records.append((cid, final, (sem_dist if sem_dist is not None else 999.0), text, meta))
+
+    records.sort(key=lambda x: x[1], reverse=True)
+    top = records[:top_k]

     return {
+        "documents": [t[3] for t in top],
+        "metadatas": [t[4] for t in top],
+        "distances": [t[2] for t in top],
+        "ids": [t[0] for t in top],
+        "combined_scores": [t[1] for t in top],
+        "best_doc": (top[0][4].get("filename") if top else None),
+        "user_actions": user_actions,
+        "user_modules": user_modules,
     }

+# -------------------------- Section helpers -------------------------
 def get_section_text(filename: str, section: str) -> str:
     texts: List[str] = []
     for d in bm25_docs:
…
             t = (d.get("text") or "").strip()
             if t:
                 texts.append(t)
+    return "\n\n".join(texts).strip()

 def get_best_steps_section_text(filename: str) -> str:
     texts: List[str] = []
…
             t = (d.get("text") or "").strip()
             if t:
                 texts.append(t)
+    return "\n\n".join(texts).strip()

 def get_best_errors_section_text(filename: str) -> str:
     texts: List[str] = []
     for d in bm25_docs:
         m = d.get("meta", {})
         sec = (m.get("section") or "").lower()
+        topics = (m.get("module_tags") or "")
+        if m.get("filename") == filename and (m.get("intent_tag") == "errors" or "error" in sec):
             t = (d.get("text") or "").strip()
             if t:
                 texts.append(t)
+    return "\n\n".join(texts).strip()

+# -------------------------- Admin helpers ---------------------------
 def get_kb_runtime_info() -> Dict[str, Any]:
     return {
         "chroma_path": CHROMA_PATH,
…
         "bm25_ready": bm25_ready,
     }

 def reset_kb(folder_path: str) -> Dict[str, Any]:
     result = {"status": "OK", "message": "KB reset and re-ingested"}
     try:
…
         pass
     global collection
     collection = client.get_or_create_collection(name="knowledge_base")
     try:
         if os.path.isfile(BM25_INDEX_FILE):
             os.remove(BM25_INDEX_FILE)
     except Exception as e:
         result.setdefault("warnings", []).append(f"bm25 index delete: {e}")
     os.makedirs(CHROMA_PATH, exist_ok=True)
     ingest_documents(folder_path)
     result["info"] = get_kb_runtime_info()
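End-to-end usage sketch (the module path and SOP folder are assumptions; the query is illustrative):

    from services.kb_creation import ingest_documents, hybrid_search_knowledge_base

    ingest_documents("./sop_docs")
    res = hybrid_search_knowledge_base("how to update an appointment", top_k=5)
    print(res["user_actions"])  # ['update']: chunks tagged "create" now take the -1.40 conflict penalty
    print(res["best_doc"])      # filename of the top-ranked chunk's document
    for cid, score in zip(res["ids"], res["combined_scores"]):
        print(round(score, 2), cid)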