srilakshu012456 committed on
Commit
cb77267
·
verified ·
1 Parent(s): 256fb66

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +130 -71
services/kb_creation.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import os
2
  import re
3
  import pickle
@@ -6,15 +8,15 @@ from docx import Document
6
  from sentence_transformers import SentenceTransformer
7
  import chromadb
8
 
9
- # --------------------------- ChromaDB setup ---------------------------
10
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
11
  client = chromadb.PersistentClient(path=CHROMA_PATH)
12
  collection = client.get_or_create_collection(name="knowledge_base")
13
 
14
- # --------------------------- Embedding model ---------------------------
15
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
16
 
17
- # --------------------------- BM25 (lightweight) ---------------------------
18
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
19
  bm25_docs: List[Dict[str, Any]] = []
20
  bm25_inverted: Dict[str, List[int]] = {}
@@ -24,7 +26,7 @@ bm25_ready: bool = False
24
  BM25_K1 = 1.5
25
  BM25_B = 0.75
26
 
27
- # --------------------------- Utilities ---------------------------
28
  def _tokenize(text: str) -> List[str]:
29
  if not text:
30
  return []
@@ -34,18 +36,13 @@ def _tokenize(text: str) -> List[str]:
34
  def _normalize_query(q: str) -> str:
35
  q = (q or "").strip().lower()
36
  q = re.sub(r"[^\w\s]", " ", q)
37
- q = re.sub(
38
- r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b",
39
- " ",
40
- q,
41
- )
42
  q = re.sub(r"\s+", " ", q).strip()
43
  return q
44
 
45
  def _tokenize_meta_value(val: Optional[str]) -> List[str]:
46
  return _tokenize(val or "")
47
 
48
- # --------------------------- DOCX parsing & chunking ---------------------------
49
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
50
  sections: List[Tuple[str, List[str]]] = []
51
  current_title = None
@@ -78,49 +75,88 @@ def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: Lis
78
  for i in range(0, len(words), max_words):
79
  chunk_body = ' '.join(words[i:i + max_words]).strip()
80
  if chunk_body:
81
- chunks.append(chunk_body) # no doc/section headers inside text
82
  if not chunks:
83
  chunks = [body]
84
  return chunks
85
 
86
- # --------------------------- Intent tagging (auto) ---------------------------
 
 
 
 
 
 
 
 
 
 
 
 
87
  def _infer_intent_tag(section_title: str) -> str:
88
  st = (section_title or "").lower()
89
- if any(k in st for k in ["process steps", "procedure", "how to", "workflow", "instructions"]):
90
  return "steps"
91
- if any(k in st for k in ["common errors", "resolution", "troubleshooting"]):
92
  return "errors"
93
- if any(k in st for k in ["pre-requisites", "prerequisites"]):
94
  return "prereqs"
95
  if any(k in st for k in ["purpose", "overview", "introduction"]):
96
  return "purpose"
97
  return "neutral"
98
 
99
- # --------------------------- Ingestion ---------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  def ingest_documents(folder_path: str) -> None:
101
- print(f"📂 Checking folder: {folder_path}")
102
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
103
- print(f"Found {len(files)} Word files: {files}")
104
  if not files:
105
- print("⚠️ No .docx files found. Please check the folder path.")
106
  return
107
-
108
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
109
  bm25_docs, bm25_inverted, bm25_df = [], {}, {}
110
  bm25_avgdl, bm25_ready = 0.0, False
111
-
112
  for file in files:
113
  file_path = os.path.join(folder_path, file)
114
  doc_title = os.path.splitext(file)[0]
115
  doc = Document(file_path)
116
  sections = _split_by_sections(doc)
117
  total_chunks = 0
118
-
119
  for s_idx, (section_title, paras) in enumerate(sections):
120
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
121
  total_chunks += len(chunks)
122
- intent_tag = _infer_intent_tag(section_title)
123
  for c_idx, chunk in enumerate(chunks):
 
 
 
 
 
 
 
124
  embedding = model.encode(chunk).tolist()
125
  doc_id = f"{file}:{s_idx}:{c_idx}"
126
  meta = {
@@ -129,7 +165,8 @@ def ingest_documents(folder_path: str) -> None:
129
  "chunk_index": c_idx,
130
  "title": doc_title,
131
  "collection": "SOP",
132
- "intent_tag": intent_tag, # NEW
 
133
  }
134
  try:
135
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
@@ -138,29 +175,31 @@ def ingest_documents(folder_path: str) -> None:
138
  collection.delete(ids=[doc_id])
139
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
140
  except Exception as e2:
141
- print(f" Upsert failed for {doc_id}: {e2}")
142
-
143
  tokens = _tokenize(chunk)
144
  tf: Dict[str, int] = {}
145
- for t in tokens:
146
- tf[t] = tf.get(t, 0) + 1
147
  idx = len(bm25_docs)
148
- bm25_docs.append({"id": doc_id, "text": chunk, "tokens": tokens, "tf": tf, "length": len(tokens), "meta": meta})
149
-
 
 
 
 
 
 
150
  seen = set()
151
  for term in tf.keys():
152
  bm25_inverted.setdefault(term, []).append(idx)
153
  if term not in seen:
154
  bm25_df[term] = bm25_df.get(term, 0) + 1
155
  seen.add(term)
156
-
157
- print(f"📄 Ingested {file} → {total_chunks} chunks")
158
-
159
  N = len(bm25_docs)
160
  if N > 0:
161
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
162
  bm25_ready = True
163
-
164
  payload = {
165
  "bm25_docs": bm25_docs,
166
  "bm25_inverted": bm25_inverted,
@@ -172,9 +211,10 @@ def ingest_documents(folder_path: str) -> None:
172
  os.makedirs(CHROMA_PATH, exist_ok=True)
173
  with open(BM25_INDEX_FILE, "wb") as f:
174
  pickle.dump(payload, f)
175
- print(f" BM25 index saved: {BM25_INDEX_FILE}")
176
- print(f" Documents ingested. Total entries in Chroma: {collection.count()}")
177
 
 
178
  def _load_bm25_index() -> None:
179
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
180
  if not os.path.exists(BM25_INDEX_FILE):
@@ -188,13 +228,13 @@ def _load_bm25_index() -> None:
188
  bm25_avgdl = payload.get("bm25_avgdl", 0.0)
189
  bm25_ready = len(bm25_docs) > 0
190
  if bm25_ready:
191
- print(f" BM25 index loaded: {BM25_INDEX_FILE} (docs={len(bm25_docs)})")
192
  except Exception as e:
193
- print(f"⚠️ Could not load BM25 index: {e}")
194
 
195
  _load_bm25_index()
196
 
197
- # --------------------------- BM25 search ---------------------------
198
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
199
  if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
200
  return 0.0
@@ -240,7 +280,7 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
240
  scored.sort(key=lambda x: x[1], reverse=True)
241
  return scored[:top_k]
242
 
243
- # --------------------------- Semantic-only ---------------------------
244
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
245
  query_embedding = model.encode(query).tolist()
246
  res = collection.query(
@@ -248,16 +288,14 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
248
  n_results=top_k,
249
  include=['documents', 'metadatas', 'distances']
250
  )
251
- docs_ll = res.get("documents", [[]]) or [[]]
252
- metas_ll = res.get("metadatas", [[]]) or [[]]
253
- dists_ll = res.get("distances", [[]]) or [[]]
254
- ids_ll = res.get("ids", [[]]) or [[]]
255
-
256
  documents = docs_ll[0] if docs_ll else []
257
  metadatas = metas_ll[0] if metas_ll else []
258
  distances = dists_ll[0] if dists_ll else []
259
  ids = ids_ll[0] if ids_ll else []
260
-
261
  if not ids and documents:
262
  synthesized = []
263
  for i, m in enumerate(metadatas):
@@ -266,9 +304,7 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
266
  idx = (m or {}).get("chunk_index", i)
267
  synthesized.append(f"{fn}:{sec}:{idx}")
268
  ids = synthesized
269
-
270
- print(f"🔎 KB search → {len(documents)} docs (top_k={top_k}); "
271
- f"first distance: {distances[0] if distances else 'n/a'}; ids={len(ids)}")
272
  return {
273
  "documents": documents,
274
  "metadatas": metadatas,
@@ -276,21 +312,27 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
276
  "ids": ids,
277
  }
278
 
279
- # --------------------------- Hybrid (BM25 + Embeddings + Intent + Action) ---------------------------
280
  ACTION_SYNONYMS = {
281
  "create": ["create", "creation", "add", "new", "generate"],
282
  "update": ["update", "modify", "change", "edit"],
283
  "delete": ["delete", "remove"],
284
  "navigate": ["navigate", "go to", "open"],
285
- # NOTE: 'perform' REMOVED to avoid wrong boosts like Appointment "performed..."
286
  }
287
 
 
 
 
 
 
 
 
288
  def _detect_user_intent(query: str) -> str:
289
  q = (query or "").lower()
290
- if any(k in q for k in ["steps", "procedure", "how to", "navigate", "perform", "do", "process"]):
291
- return "steps"
292
- if any(k in q for k in ["error", "issue", "fail", "not working", "resolution", "fix"]):
293
  return "errors"
 
 
294
  if any(k in q for k in ["pre-requisite", "prerequisites", "requirement", "requirements"]):
295
  return "prereqs"
296
  if any(k in q for k in ["purpose", "overview", "introduction"]):
@@ -313,13 +355,18 @@ def _intent_weight(meta: dict, user_intent: str) -> float:
313
  return 1.0
314
  if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]:
315
  return -0.6
 
 
 
 
316
  return -0.2
317
 
318
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
319
  fn_tokens = _tokenize_meta_value(meta.get("filename"))
320
  title_tokens = _tokenize_meta_value(meta.get("title"))
321
  section_tokens = _tokenize_meta_value(meta.get("section"))
322
- meta_tokens = set(fn_tokens + title_tokens + section_tokens)
 
323
  if not meta_tokens or not q_terms:
324
  return 0.0
325
  qset = set(q_terms)
@@ -368,7 +415,6 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
368
  bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
369
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
370
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
371
-
372
  bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
373
  for idx, nscore in bm25_norm_pairs:
374
  d = bm25_docs[idx]
@@ -378,8 +424,8 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
378
 
379
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
380
 
381
- gamma = 0.25 # meta overlap
382
- delta = 0.35 # intent boost
383
  epsilon = 0.30 # action weight
384
 
385
  combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]] = []
@@ -392,20 +438,15 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
392
  sem_meta = sem_metas[pos] if pos < len(sem_metas) else {}
393
  else:
394
  sem_sim, sem_dist, sem_text, sem_meta = 0.0, None, "", {}
395
-
396
  bm25_sim = bm25_id_to_norm.get(cid, 0.0)
397
  bm25_text = bm25_id_to_text.get(cid, "")
398
  bm25_meta = bm25_id_to_meta.get(cid, {})
399
-
400
  text = sem_text if sem_text else bm25_text
401
  meta = sem_meta if sem_meta else bm25_meta
402
-
403
  m_overlap = _meta_overlap(meta, q_terms)
404
  intent_boost = _intent_weight(meta, user_intent)
405
  act_wt = _action_weight(text, actions)
406
-
407
  final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost + epsilon * act_wt
408
-
409
  combined_records_ext.append(
410
  (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt)
411
  )
@@ -423,7 +464,9 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
423
  total_intent = sum(max(0.0, r[6]) for r in recs)
424
  total_action = sum(max(0.0, r[7]) for r in recs)
425
  total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
426
- return total_score + 0.4 * total_overlap + 0.6 * total_intent + 0.5 * total_action + 0.3 * total_penalty
 
 
427
 
428
  best_doc, best_doc_prior = None, -1.0
429
  for fn, recs in doc_groups.items():
@@ -441,7 +484,6 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
441
 
442
  reordered = best_recs + other_recs
443
  top = reordered[:top_k]
444
-
445
  documents = [t[3] for t in top]
446
  metadatas = [t[4] for t in top]
447
  distances = [t[2] for t in top]
@@ -460,9 +502,8 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
460
  "actions": actions,
461
  }
462
 
463
- # --------------------------- Section fetch helpers (for full output) ---------------------------
464
  def get_section_text(filename: str, section: str) -> str:
465
- """Concatenate all chunk texts for a given filename+section."""
466
  texts: List[str] = []
467
  for d in bm25_docs:
468
  m = d.get("meta", {})
@@ -473,7 +514,6 @@ def get_section_text(filename: str, section: str) -> str:
473
  return "\n\n".join(texts).strip()
474
 
475
  def get_best_steps_section_text(filename: str) -> str:
476
- """Return combined text of all 'steps' sections in the given SOP (filename)."""
477
  texts: List[str] = []
478
  for d in bm25_docs:
479
  m = d.get("meta", {})
@@ -483,7 +523,27 @@ def get_best_steps_section_text(filename: str) -> str:
483
  texts.append(t)
484
  return "\n\n".join(texts).strip()
485
 
486
- # --- Admin helpers (optional; unchanged) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487
  def get_kb_runtime_info() -> Dict[str, Any]:
488
  return {
489
  "chroma_path": CHROMA_PATH,
@@ -513,5 +573,4 @@ def reset_kb(folder_path: str) -> Dict[str, Any]:
513
  result["info"] = get_kb_runtime_info()
514
  return result
515
  except Exception as e:
516
- return {"status": "ERROR", "error": f"{e}", "info": get_kb_runtime_info()}
517
-
 
1
+
2
+ # services/kb_creation.py
3
  import os
4
  import re
5
  import pickle
 
8
  from sentence_transformers import SentenceTransformer
9
  import chromadb
10
 
11
+ # ----------------------- ChromaDB setup -----------------------
12
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
13
  client = chromadb.PersistentClient(path=CHROMA_PATH)
14
  collection = client.get_or_create_collection(name="knowledge_base")
15
 
16
+ # ----------------------- Embedding model -----------------------
17
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
18
 
19
+ # ----------------------- BM25 (lightweight) -----------------------
20
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
21
  bm25_docs: List[Dict[str, Any]] = []
22
  bm25_inverted: Dict[str, List[int]] = {}
 
26
  BM25_K1 = 1.5
27
  BM25_B = 0.75
28
 
29
+ # ----------------------- Utilities -----------------------
30
  def _tokenize(text: str) -> List[str]:
31
  if not text:
32
  return []
 
36
  def _normalize_query(q: str) -> str:
37
  q = (q or "").strip().lower()
38
  q = re.sub(r"[^\w\s]", " ", q)
 
 
 
 
 
39
  q = re.sub(r"\s+", " ", q).strip()
40
  return q
41
 
42
def _tokenize_meta_value(val: Optional[str]) -> List[str]:
    """Tokenize a metadata field value, treating None as an empty string."""
    return _tokenize("" if val is None else val)
44
 
45
+ # ----------------------- DOCX parsing & chunking -----------------------
46
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
47
  sections: List[Tuple[str, List[str]]] = []
48
  current_title = None
 
75
  for i in range(0, len(words), max_words):
76
  chunk_body = ' '.join(words[i:i + max_words]).strip()
77
  if chunk_body:
78
+ chunks.append(chunk_body)
79
  if not chunks:
80
  chunks = [body]
81
  return chunks
82
 
83
+ # ----------------------- Intent tagging -----------------------
84
+ SECTION_STEPS_HINTS = ["process steps", "procedure", "how to", "workflow", "instructions", "steps"]
85
+ SECTION_ERRORS_HINTS = ["common errors", "resolution", "troubleshooting", "known issues", "escalation", "escalation path", "permissions", "access"]
86
+
87
+ PERMISSION_TERMS = [
88
+ "permission", "permissions", "access", "access right", "authorization", "authorisation",
89
+ "role", "role access", "role mapping", "security", "security profile", "privilege", "insufficient",
90
+ "not allowed", "not authorized", "denied", "restrict"
91
+ ]
92
+
93
+ ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
94
+ STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
95
+
96
  def _infer_intent_tag(section_title: str) -> str:
97
  st = (section_title or "").lower()
98
+ if any(k in st for k in SECTION_STEPS_HINTS):
99
  return "steps"
100
+ if any(k in st for k in SECTION_ERRORS_HINTS):
101
  return "errors"
102
+ if "pre" in st and "requisite" in st:
103
  return "prereqs"
104
  if any(k in st for k in ["purpose", "overview", "introduction"]):
105
  return "purpose"
106
  return "neutral"
107
 
108
+ def _derive_semantic_intent_from_text(text: str) -> Tuple[str, List[str]]:
109
+ """Return ('errors'|'steps'|'neutral', topic_tags) by scanning the text."""
110
+ t = (text or "").lower()
111
+ tags: List[str] = []
112
+ intent = "neutral"
113
+ # permissions/access first (override)
114
+ if any(term in t for term in PERMISSION_TERMS):
115
+ intent = "errors"
116
+ tags.append("permissions")
117
+ if "role" in t:
118
+ tags.append("role_access")
119
+ if "security" in t:
120
+ tags.append("security")
121
+ # generic errors
122
+ if intent == "neutral" and any(term in t for term in ERROR_TERMS):
123
+ intent = "errors"
124
+ tags.append("errors")
125
+ # steps indicators
126
+ if intent == "neutral" and any(v in t for v in STEP_VERBS):
127
+ intent = "steps"
128
+ tags.append("procedure")
129
+ return intent, list(set(tags))
130
+
131
+ # ----------------------- Ingestion -----------------------
132
  def ingest_documents(folder_path: str) -> None:
133
+ print(f"[KB] Checking folder: {folder_path}")
134
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
135
+ print(f"[KB] Found {len(files)} Word files: {files}")
136
  if not files:
137
+ print("[KB] WARNING: No .docx files found. Please check the folder path.")
138
  return
 
139
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
140
  bm25_docs, bm25_inverted, bm25_df = [], {}, {}
141
  bm25_avgdl, bm25_ready = 0.0, False
 
142
  for file in files:
143
  file_path = os.path.join(folder_path, file)
144
  doc_title = os.path.splitext(file)[0]
145
  doc = Document(file_path)
146
  sections = _split_by_sections(doc)
147
  total_chunks = 0
 
148
  for s_idx, (section_title, paras) in enumerate(sections):
149
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
150
  total_chunks += len(chunks)
151
+ base_intent = _infer_intent_tag(section_title)
152
  for c_idx, chunk in enumerate(chunks):
153
+ derived_intent, topic_tags = _derive_semantic_intent_from_text(chunk)
154
+ # choose strongest intent: errors overrides steps
155
+ final_intent = base_intent
156
+ if derived_intent == "errors":
157
+ final_intent = "errors"
158
+ elif base_intent == "neutral" and derived_intent in ("steps", "prereqs"):
159
+ final_intent = derived_intent
160
  embedding = model.encode(chunk).tolist()
161
  doc_id = f"{file}:{s_idx}:{c_idx}"
162
  meta = {
 
165
  "chunk_index": c_idx,
166
  "title": doc_title,
167
  "collection": "SOP",
168
+ "intent_tag": final_intent,
169
+ "topic_tags": topic_tags,
170
  }
171
  try:
172
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
 
175
  collection.delete(ids=[doc_id])
176
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
177
  except Exception as e2:
178
+ print(f"[KB] ERROR: Upsert failed for {doc_id}: {e2}")
 
179
  tokens = _tokenize(chunk)
180
  tf: Dict[str, int] = {}
181
+ for tkn in tokens:
182
+ tf[tkn] = tf.get(tkn, 0) + 1
183
  idx = len(bm25_docs)
184
+ bm25_docs.append({
185
+ "id": doc_id,
186
+ "text": chunk,
187
+ "tokens": tokens,
188
+ "tf": tf,
189
+ "length": len(tokens),
190
+ "meta": meta,
191
+ })
192
  seen = set()
193
  for term in tf.keys():
194
  bm25_inverted.setdefault(term, []).append(idx)
195
  if term not in seen:
196
  bm25_df[term] = bm25_df.get(term, 0) + 1
197
  seen.add(term)
198
+ print(f"[KB] Ingested {file} → {total_chunks} chunks")
 
 
199
  N = len(bm25_docs)
200
  if N > 0:
201
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
202
  bm25_ready = True
 
203
  payload = {
204
  "bm25_docs": bm25_docs,
205
  "bm25_inverted": bm25_inverted,
 
211
  os.makedirs(CHROMA_PATH, exist_ok=True)
212
  with open(BM25_INDEX_FILE, "wb") as f:
213
  pickle.dump(payload, f)
214
+ print(f"[KB] BM25 index saved: {BM25_INDEX_FILE}")
215
+ print(f"[KB] Documents ingested. Total entries in Chroma: {collection.count()}")
216
 
217
+ # ----------------------- BM25 load -----------------------
218
  def _load_bm25_index() -> None:
219
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
220
  if not os.path.exists(BM25_INDEX_FILE):
 
228
  bm25_avgdl = payload.get("bm25_avgdl", 0.0)
229
  bm25_ready = len(bm25_docs) > 0
230
  if bm25_ready:
231
+ print(f"[KB] BM25 index loaded: {BM25_INDEX_FILE} (docs={len(bm25_docs)})")
232
  except Exception as e:
233
+ print(f"[KB] WARNING: Could not load BM25 index: {e}")
234
 
235
  _load_bm25_index()
236
 
237
+ # ----------------------- BM25 search -----------------------
238
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
239
  if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
240
  return 0.0
 
280
  scored.sort(key=lambda x: x[1], reverse=True)
281
  return scored[:top_k]
282
 
283
+ # ----------------------- Semantic-only -----------------------
284
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
285
  query_embedding = model.encode(query).tolist()
286
  res = collection.query(
 
288
  n_results=top_k,
289
  include=['documents', 'metadatas', 'distances']
290
  )
291
+ docs_ll = res.get("documents", [[ ]]) or [[ ]]
292
+ metas_ll = res.get("metadatas", [[ ]]) or [[ ]]
293
+ dists_ll = res.get("distances", [[ ]]) or [[ ]]
294
+ ids_ll = res.get("ids", [[ ]]) or [[ ]]
 
295
  documents = docs_ll[0] if docs_ll else []
296
  metadatas = metas_ll[0] if metas_ll else []
297
  distances = dists_ll[0] if dists_ll else []
298
  ids = ids_ll[0] if ids_ll else []
 
299
  if not ids and documents:
300
  synthesized = []
301
  for i, m in enumerate(metadatas):
 
304
  idx = (m or {}).get("chunk_index", i)
305
  synthesized.append(f"{fn}:{sec}:{idx}")
306
  ids = synthesized
307
+ print(f"[KB] search → {len(documents)} docs (top_k={top_k}); first distance: {distances[0] if distances else 'n/a'}; ids={len(ids)}")
 
 
308
  return {
309
  "documents": documents,
310
  "metadatas": metadatas,
 
312
  "ids": ids,
313
  }
314
 
315
+ # ----------------------- Hybrid search helpers -----------------------
316
  ACTION_SYNONYMS = {
317
  "create": ["create", "creation", "add", "new", "generate"],
318
  "update": ["update", "modify", "change", "edit"],
319
  "delete": ["delete", "remove"],
320
  "navigate": ["navigate", "go to", "open"],
 
321
  }
322
 
323
+ ERROR_INTENT_TERMS = [
324
+ "error", "issue", "fail", "not working", "resolution", "fix",
325
+ "permission", "permissions", "access", "no access", "authorization", "authorisation",
326
+ "role", "role mapping", "not authorized", "permission denied", "insufficient privileges",
327
+ "escalation", "escalation path", "access right"
328
+ ]
329
+
330
  def _detect_user_intent(query: str) -> str:
331
  q = (query or "").lower()
332
+ if any(k in q for k in ERROR_INTENT_TERMS):
 
 
333
  return "errors"
334
+ if any(k in q for k in ["steps", "procedure", "how to", "navigate", "process", "do", "perform"]):
335
+ return "steps"
336
  if any(k in q for k in ["pre-requisite", "prerequisites", "requirement", "requirements"]):
337
  return "prereqs"
338
  if any(k in q for k in ["purpose", "overview", "introduction"]):
 
355
  return 1.0
356
  if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]:
357
  return -0.6
358
+ st = (meta or {}).get("section", "").lower()
359
+ topics = (meta or {}).get("topic_tags", []) or []
360
+ if user_intent == "errors" and (any(k in st for k in ["escalation", "permissions", "access", "known issues"]) or ("permissions" in topics)):
361
+ return 0.7
362
  return -0.2
363
 
364
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
365
  fn_tokens = _tokenize_meta_value(meta.get("filename"))
366
  title_tokens = _tokenize_meta_value(meta.get("title"))
367
  section_tokens = _tokenize_meta_value(meta.get("section"))
368
+ topic_tokens = _tokenize_meta_value(' '.join((meta.get("topic_tags") or [])))
369
+ meta_tokens = set(fn_tokens + title_tokens + section_tokens + topic_tokens)
370
  if not meta_tokens or not q_terms:
371
  return 0.0
372
  qset = set(q_terms)
 
415
  bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
416
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
417
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
 
418
  bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
419
  for idx, nscore in bm25_norm_pairs:
420
  d = bm25_docs[idx]
 
424
 
425
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
426
 
427
+ gamma = 0.30 # meta overlap
428
+ delta = 0.45 # intent boost (stronger for errors)
429
  epsilon = 0.30 # action weight
430
 
431
  combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]] = []
 
438
  sem_meta = sem_metas[pos] if pos < len(sem_metas) else {}
439
  else:
440
  sem_sim, sem_dist, sem_text, sem_meta = 0.0, None, "", {}
 
441
  bm25_sim = bm25_id_to_norm.get(cid, 0.0)
442
  bm25_text = bm25_id_to_text.get(cid, "")
443
  bm25_meta = bm25_id_to_meta.get(cid, {})
 
444
  text = sem_text if sem_text else bm25_text
445
  meta = sem_meta if sem_meta else bm25_meta
 
446
  m_overlap = _meta_overlap(meta, q_terms)
447
  intent_boost = _intent_weight(meta, user_intent)
448
  act_wt = _action_weight(text, actions)
 
449
  final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost + epsilon * act_wt
 
450
  combined_records_ext.append(
451
  (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt)
452
  )
 
464
  total_intent = sum(max(0.0, r[6]) for r in recs)
465
  total_action = sum(max(0.0, r[7]) for r in recs)
466
  total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
467
+ esc_weight = 0.3 if any("escalation" in (r[4] or {}).get("section", "").lower() for r in recs) else 0.0
468
+ perm_weight = 0.3 if any("permissions" in ((r[4] or {}).get("topic_tags") or []) for r in recs) else 0.0
469
+ return total_score + 0.4 * total_overlap + 0.7 * total_intent + 0.5 * total_action + 0.3 * total_penalty + esc_weight + perm_weight
470
 
471
  best_doc, best_doc_prior = None, -1.0
472
  for fn, recs in doc_groups.items():
 
484
 
485
  reordered = best_recs + other_recs
486
  top = reordered[:top_k]
 
487
  documents = [t[3] for t in top]
488
  metadatas = [t[4] for t in top]
489
  distances = [t[2] for t in top]
 
502
  "actions": actions,
503
  }
504
 
505
+ # ----------------------- Section fetch helpers -----------------------
506
  def get_section_text(filename: str, section: str) -> str:
 
507
  texts: List[str] = []
508
  for d in bm25_docs:
509
  m = d.get("meta", {})
 
514
  return "\n\n".join(texts).strip()
515
 
516
  def get_best_steps_section_text(filename: str) -> str:
 
517
  texts: List[str] = []
518
  for d in bm25_docs:
519
  m = d.get("meta", {})
 
523
  texts.append(t)
524
  return "\n\n".join(texts).strip()
525
 
526
def get_best_errors_section_text(filename: str) -> str:
    """Return combined text of all error/permission/escalation chunks for the given SOP."""
    section_keywords = ("error", "escalation", "permission", "access")
    collected: List[str] = []
    for entry in bm25_docs:
        meta = entry.get("meta", {})
        if meta.get("filename") != filename:
            continue
        section_name = (meta.get("section") or "").lower()
        topic_tags = meta.get("topic_tags") or []
        # A chunk qualifies if it was tagged "errors" at ingest time, its
        # section title mentions error/escalation/permission/access, or it
        # carries the "permissions" topic tag.
        qualifies = (
            meta.get("intent_tag") == "errors"
            or any(kw in section_name for kw in section_keywords)
            or "permissions" in topic_tags
        )
        if not qualifies:
            continue
        body = (entry.get("text") or "").strip()
        if body:
            collected.append(body)
    return "\n\n".join(collected).strip()
545
+
546
+ # ----------------------- Admin helpers -----------------------
547
  def get_kb_runtime_info() -> Dict[str, Any]:
548
  return {
549
  "chroma_path": CHROMA_PATH,
 
573
  result["info"] = get_kb_runtime_info()
574
  return result
575
  except Exception as e:
576
+ return {"status": "ERROR", "error": f"{e}", "info": get_kb_runtime_info()}