Spaces:

ChatbotNova
/

Chatbot-Backend

Sleeping

App Files Community

srilakshu012456 committed on Dec 23, 2025

Commit

7a1faf2

verified ·

1 Parent(s): 192969d

Update services/kb_creation.py

Browse files

Files changed (1) hide show

services/kb_creation.py +44 -19

services/kb_creation.py CHANGED Viewed

@@ -13,14 +13,13 @@ client = chromadb.PersistentClient(path=CHROMA_PATH)
 collection = client.get_or_create_collection(name="knowledge_base")
 # --------------------------- Embedding model ---------------------------
-# model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # optional
 model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 # --------------------------- BM25 (lightweight) ---------------------------
 BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
-bm25_docs: List[Dict[str, Any]] = []  # each: {id, text, tokens, tf, length, meta}
-bm25_inverted: Dict[str, List[int]] = {}  # term -> list of doc indices in bm25_docs
-bm25_df: Dict[str, int] = {}  # term -> document frequency
 bm25_avgdl: float = 0.0
 bm25_ready: bool = False
 BM25_K1 = 1.5
@@ -228,12 +227,14 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
     q_terms = _tokenize(norm)
     if not q_terms:
         return []
     candidates = set()
     for t in q_terms:
         for idx in bm25_inverted.get(t, []):
             candidates.add(idx)
     if not candidates:
         candidates = set(range(len(bm25_docs)))
     scored = []
     for idx in candidates:
         s = _bm25_score_for_doc(q_terms, idx)
@@ -248,7 +249,7 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
     res = collection.query(
         query_embeddings=[query_embedding],
         n_results=top_k,
-        include=['documents', 'metadatas', 'distances']  # no 'ids' here
     )
     docs_ll = res.get("documents", [[]]) or [[]]
     metas_ll = res.get("metadatas", [[]]) or [[]]
@@ -329,9 +330,6 @@ def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
     return inter / max(1, len(qset))
 def _action_weight(text: str, actions: List[str]) -> float:
-    """
-    Boost if text contains target action verb(s); penalize if text dominated by other actions.
-    """
     if not actions:
         return 0.0
     t = (text or "").lower()
@@ -339,8 +337,7 @@ def _action_weight(text: str, actions: List[str]) -> float:
     for act in actions:
         for syn in ACTION_SYNONYMS.get(act, [act]):
             if syn in t:
-                score += 1.0  # boost for each matching synonym
-    # Penalize conflicting actions: e.g., query 'create' but text has 'delete' heavily
     conflicts = {"create": ["delete"], "delete": ["create"], "update": ["delete"], "navigate": [], "perform": []}
     for act in actions:
         for bad in conflicts.get(act, []):
@@ -350,14 +347,10 @@ def _action_weight(text: str, actions: List[str]) -> float:
     return score
 def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
-    """
-    final = alpha * semantic_sim + beta * bm25_norm + gamma * meta_overlap + delta * intent_boost + epsilon * action_weight
-    + document-level voting prior.
-    """
     norm_query = _normalize_query(query)
     q_terms = _tokenize(norm_query)
     user_intent = _detect_user_intent(query)
-    actions = _extract_actions(query)  # NEW
     sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
     sem_docs = sem_res.get("documents", [])
@@ -388,9 +381,9 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
     union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
-    gamma = 0.25  # meta overlap
-    delta = 0.35  # intent boost
-    epsilon = 0.30  # action weight
     combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]] = []
     for cid in union_ids:
@@ -412,7 +405,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
         m_overlap = _meta_overlap(meta, q_terms)
         intent_boost = _intent_weight(meta, user_intent)
-        act_wt = _action_weight(text, actions)  # NEW
         final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost + epsilon * act_wt
@@ -469,3 +462,35 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
         "user_intent": user_intent,
         "actions": actions,
     }

 collection = client.get_or_create_collection(name="knowledge_base")
 # --------------------------- Embedding model ---------------------------
 model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 # --------------------------- BM25 (lightweight) ---------------------------
 BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
+bm25_docs: List[Dict[str, Any]] = []
+bm25_inverted: Dict[str, List[int]] = {}
+bm25_df: Dict[str, int] = {}
 bm25_avgdl: float = 0.0
 bm25_ready: bool = False
 BM25_K1 = 1.5
     q_terms = _tokenize(norm)
     if not q_terms:
         return []
     candidates = set()
     for t in q_terms:
         for idx in bm25_inverted.get(t, []):
             candidates.add(idx)
     if not candidates:
         candidates = set(range(len(bm25_docs)))
     scored = []
     for idx in candidates:
         s = _bm25_score_for_doc(q_terms, idx)
     res = collection.query(
         query_embeddings=[query_embedding],
         n_results=top_k,
+        include=['documents', 'metadatas', 'distances']
     )
     docs_ll = res.get("documents", [[]]) or [[]]
     metas_ll = res.get("metadatas", [[]]) or [[]]
     return inter / max(1, len(qset))
 def _action_weight(text: str, actions: List[str]) -> float:
     if not actions:
         return 0.0
     t = (text or "").lower()
     for act in actions:
         for syn in ACTION_SYNONYMS.get(act, [act]):
             if syn in t:
+                score += 1.0
     conflicts = {"create": ["delete"], "delete": ["create"], "update": ["delete"], "navigate": [], "perform": []}
     for act in actions:
         for bad in conflicts.get(act, []):
     return score
 def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
     norm_query = _normalize_query(query)
     q_terms = _tokenize(norm_query)
     user_intent = _detect_user_intent(query)
+    actions = _extract_actions(query)
     sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
     sem_docs = sem_res.get("documents", [])
     union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
+    gamma = 0.25
+    delta = 0.35
+    epsilon = 0.30
     combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]] = []
     for cid in union_ids:
         m_overlap = _meta_overlap(meta, q_terms)
         intent_boost = _intent_weight(meta, user_intent)
+        act_wt = _action_weight(text, actions)
         final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost + epsilon * act_wt
         "user_intent": user_intent,
         "actions": actions,
     }
+# --- Runtime info & reset helpers (optional admin) ---
+def get_kb_runtime_info() -> Dict[str, Any]:
+    """
+    Report the current on-disk and in-memory state of the knowledge base.
+
+    Returns a dict with the Chroma storage path and whether that directory
+    exists, the BM25 index file path and whether that file exists, the number
+    of items reported by the Chroma collection, and the module-level
+    ``bm25_ready`` flag.
+    """
+    info: Dict[str, Any] = {}
+    # Chroma persistence directory.
+    info["chroma_path"] = CHROMA_PATH
+    info["chroma_exists"] = os.path.isdir(CHROMA_PATH)
+    # Persisted BM25 index file.
+    info["bm25_index_file"] = BM25_INDEX_FILE
+    info["bm25_index_exists"] = os.path.isfile(BM25_INDEX_FILE)
+    # Live state: collection size and the BM25 readiness flag.
+    info["collection_count"] = collection.count()
+    info["bm25_ready"] = bm25_ready
+    return info
+def reset_kb(folder_path: str) -> Dict[str, Any]:
+    """
+    Drop the "knowledge_base" collection, delete the persisted BM25 index
+    file, and re-ingest documents from ``folder_path``.
+
+    Args:
+        folder_path: Passed unchanged to ``ingest_documents``.
+
+    Returns:
+        On success, ``{"status": "OK", "message": ..., "info": ...}`` where
+        ``info`` is the result of ``get_kb_runtime_info()``. A ``"warnings"``
+        list is added when a best-effort cleanup step (collection delete,
+        BM25 index file delete) failed.
+        On failure, ``{"status": "ERROR", "error": <message>, "info": ...}``.
+        This function does not raise for ``Exception`` subclasses; failures
+        are reported through the returned dict.
+    """
+    # Declared up front: the module-level handle is rebound below.
+    global collection
+    result: Dict[str, Any] = {"status": "OK", "message": "KB reset and re-ingested"}
+    try:
+        try:
+            client.delete_collection(name="knowledge_base")
+        except Exception as e:
+            # Best-effort: a failed delete must not abort the reset, but it is
+            # recorded instead of being silently discarded.
+            result.setdefault("warnings", []).append(f"collection delete: {e}")
+        collection = client.get_or_create_collection(name="knowledge_base")
+        try:
+            if os.path.isfile(BM25_INDEX_FILE):
+                os.remove(BM25_INDEX_FILE)
+        except Exception as e:
+            result.setdefault("warnings", []).append(f"bm25 index delete: {e}")
+        os.makedirs(CHROMA_PATH, exist_ok=True)
+        # NOTE(review): the in-memory BM25 globals are not cleared here;
+        # presumably ingest_documents rebuilds them - confirm.
+        ingest_documents(folder_path)
+        result["info"] = get_kb_runtime_info()
+        return result
+    except Exception as e:
+        # Collecting runtime info can fail too (it queries the collection);
+        # guard it so the caller always receives the ERROR payload.
+        try:
+            info = get_kb_runtime_info()
+        except Exception as info_err:
+            info = {"error": f"runtime info unavailable: {info_err}"}
+        return {"status": "ERROR", "error": f"{e}", "info": info}