srilakshu012456 committed on
Commit
2c3d060
·
verified ·
1 Parent(s): 2592788

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +79 -40
services/kb_creation.py CHANGED
@@ -8,15 +8,15 @@ from docx import Document
8
  from sentence_transformers import SentenceTransformer
9
  import chromadb
10
 
11
- # ----------------------- ChromaDB setup -----------------------
12
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
13
  client = chromadb.PersistentClient(path=CHROMA_PATH)
14
  collection = client.get_or_create_collection(name="knowledge_base")
15
 
16
- # ----------------------- Embedding model -----------------------
17
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
18
 
19
- # ----------------------- BM25 (lightweight) -----------------------
20
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
21
  bm25_docs: List[Dict[str, Any]] = []
22
  bm25_inverted: Dict[str, List[int]] = {}
@@ -26,23 +26,25 @@ bm25_ready: bool = False
26
  BM25_K1 = 1.5
27
  BM25_B = 0.75
28
 
29
- # ----------------------- Utilities -----------------------
30
  def _tokenize(text: str) -> List[str]:
31
  if not text:
32
  return []
33
  text = text.lower()
34
  return re.findall(r"[a-z0-9]+", text)
35
 
 
36
  def _normalize_query(q: str) -> str:
37
  q = (q or "").strip().lower()
38
  q = re.sub(r"[^\w\s]", " ", q)
39
  q = re.sub(r"\s+", " ", q).strip()
40
  return q
41
 
 
42
def _tokenize_meta_value(val: Optional[str]) -> List[str]:
    """Tokenize an optional metadata string; ``None`` produces no tokens."""
    text = val if val else ""
    return _tokenize(text)
44
 
45
- # ----------------------- DOCX parsing & chunking -----------------------
46
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
47
  sections: List[Tuple[str, List[str]]] = []
48
  current_title = None
@@ -66,6 +68,7 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
66
  sections = [("Document", all_text)]
67
  return sections
68
 
 
69
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
70
  body = "\n".join(paragraphs).strip()
71
  if not body:
@@ -80,9 +83,9 @@ def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: Lis
80
  chunks = [body]
81
  return chunks
82
 
83
- # ----------------------- Intent & Module tagging -----------------------
84
  SECTION_STEPS_HINTS = ["process steps", "procedure", "how to", "workflow", "instructions", "steps"]
85
- SECTION_ERRORS_HINTS = ["common errors", "resolution", "troubleshooting", "known issues", "escalation", "escalation path", "permissions", "access"]
86
 
87
  PERMISSION_TERMS = [
88
  "permission", "permissions", "access", "access right", "authorization", "authorisation",
@@ -90,7 +93,6 @@ PERMISSION_TERMS = [
90
  "not allowed", "not authorized", "denied", "restrict"
91
  ]
92
  ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
93
-
94
  STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
95
 
96
  MODULE_VOCAB = {
@@ -102,6 +104,7 @@ MODULE_VOCAB = {
102
  "replenishment": ["replenishment", "replenish"],
103
  }
104
 
 
105
  def _infer_intent_tag(section_title: str) -> str:
106
  st = (section_title or "").lower()
107
  if any(k in st for k in SECTION_STEPS_HINTS):
@@ -114,6 +117,7 @@ def _infer_intent_tag(section_title: str) -> str:
114
  return "purpose"
115
  return "neutral"
116
 
 
117
  def _derive_semantic_intent_from_text(text: str) -> Tuple[str, List[str]]:
118
  t = (text or "").lower()
119
  tags: List[str] = []
@@ -121,8 +125,10 @@ def _derive_semantic_intent_from_text(text: str) -> Tuple[str, List[str]]:
121
  if any(term in t for term in PERMISSION_TERMS):
122
  intent = "errors"
123
  tags.append("permissions")
124
- if "role" in t: tags.append("role_access")
125
- if "security" in t: tags.append("security")
 
 
126
  if intent == "neutral" and any(term in t for term in ERROR_TERMS):
127
  intent = "errors"
128
  tags.append("errors")
@@ -131,6 +137,7 @@ def _derive_semantic_intent_from_text(text: str) -> Tuple[str, List[str]]:
131
  tags.append("procedure")
132
  return intent, list(set(tags))
133
 
 
134
  def _derive_module_tags(text: str, filename: str, section_title: str) -> List[str]:
135
  tokens = " ".join([filename or "", section_title or "", text or ""]).lower()
136
  found = []
@@ -142,7 +149,7 @@ def _derive_module_tags(text: str, filename: str, section_title: str) -> List[st
142
  found = ["inventory"]
143
  return list(sorted(set(found)))
144
 
145
- # ----------------------- Ingestion -----------------------
146
  def ingest_documents(folder_path: str) -> None:
147
  print(f"[KB] Checking folder: {folder_path}")
148
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
@@ -150,19 +157,24 @@ def ingest_documents(folder_path: str) -> None:
150
  if not files:
151
  print("[KB] WARNING: No .docx files found. Please check the folder path.")
152
  return
 
153
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
154
  bm25_docs, bm25_inverted, bm25_df = [], {}, {}
155
  bm25_avgdl, bm25_ready = 0.0, False
 
156
  for file in files:
157
  file_path = os.path.join(folder_path, file)
158
  doc_title = os.path.splitext(file)[0]
159
  doc = Document(file_path)
160
  sections = _split_by_sections(doc)
161
  total_chunks = 0
 
162
  for s_idx, (section_title, paras) in enumerate(sections):
163
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
164
  total_chunks += len(chunks)
 
165
  base_intent = _infer_intent_tag(section_title)
 
166
  for c_idx, chunk in enumerate(chunks):
167
  derived_intent, topic_tags = _derive_semantic_intent_from_text(chunk)
168
  final_intent = base_intent
@@ -170,8 +182,8 @@ def ingest_documents(folder_path: str) -> None:
170
  final_intent = "errors"
171
  elif base_intent == "neutral" and derived_intent in ("steps", "prereqs"):
172
  final_intent = derived_intent
173
- module_tags = _derive_module_tags(chunk, file, section_title)
174
 
 
175
  embedding = model.encode(chunk).tolist()
176
  doc_id = f"{file}:{s_idx}:{c_idx}"
177
  meta = {
@@ -180,9 +192,9 @@ def ingest_documents(folder_path: str) -> None:
180
  "chunk_index": c_idx,
181
  "title": doc_title,
182
  "collection": "SOP",
183
- "intent_tag": final_intent, # str
184
- "topic_tags": ", ".join(topic_tags) if topic_tags else "", # str (NOT list)
185
- "module_tags": ", ".join(module_tags) if module_tags else "", # str (NOT list)
186
  }
187
  try:
188
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
@@ -192,10 +204,12 @@ def ingest_documents(folder_path: str) -> None:
192
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
193
  except Exception as e2:
194
  print(f"[KB] ERROR: Upsert failed for {doc_id}: {e2}")
 
195
  tokens = _tokenize(chunk)
196
  tf: Dict[str, int] = {}
197
  for tkn in tokens:
198
  tf[tkn] = tf.get(tkn, 0) + 1
 
199
  idx = len(bm25_docs)
200
  bm25_docs.append({
201
  "id": doc_id,
@@ -211,11 +225,14 @@ def ingest_documents(folder_path: str) -> None:
211
  if term not in seen:
212
  bm25_df[term] = bm25_df.get(term, 0) + 1
213
  seen.add(term)
 
214
  print(f"[KB] Ingested {file} → {total_chunks} chunks")
 
215
  N = len(bm25_docs)
216
  if N > 0:
217
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
218
  bm25_ready = True
 
219
  payload = {
220
  "bm25_docs": bm25_docs,
221
  "bm25_inverted": bm25_inverted,
@@ -230,7 +247,7 @@ def ingest_documents(folder_path: str) -> None:
230
  print(f"[KB] BM25 index saved: {BM25_INDEX_FILE}")
231
  print(f"[KB] Documents ingested. Total entries in Chroma: {collection.count()}")
232
 
233
- # ----------------------- BM25 load -----------------------
234
  def _load_bm25_index() -> None:
235
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
236
  if not os.path.exists(BM25_INDEX_FILE):
@@ -248,9 +265,10 @@ def _load_bm25_index() -> None:
248
  except Exception as e:
249
  print(f"[KB] WARNING: Could not load BM25 index: {e}")
250
 
 
251
  _load_bm25_index()
252
 
253
- # ----------------------- BM25 search -----------------------
254
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
255
  if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
256
  return 0.0
@@ -272,9 +290,10 @@ def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
272
  except Exception:
273
  idf = 1.0
274
  denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
275
- score += idf * ((tf * (BM25_K1 + 1)) / (denom or 1.0))
276
  return score
277
 
 
278
  def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
279
  if not bm25_ready:
280
  return []
@@ -296,22 +315,19 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
296
  scored.sort(key=lambda x: x[1], reverse=True)
297
  return scored[:top_k]
298
 
299
- # ----------------------- Semantic-only -----------------------
300
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
301
  query_embedding = model.encode(query).tolist()
302
  res = collection.query(
303
  query_embeddings=[query_embedding],
304
  n_results=top_k,
305
- include=['documents', 'metadatas', 'distances']
306
  )
307
- docs_ll = res.get("documents", [[ ]]) or [[ ]]
308
- metas_ll = res.get("metadatas", [[ ]]) or [[ ]]
309
- dists_ll = res.get("distances", [[ ]]) or [[ ]]
310
- ids_ll = res.get("ids", [[ ]]) or [[ ]]
311
- documents = docs_ll[0] if docs_ll else []
312
- metadatas = metas_ll[0] if metas_ll else []
313
- distances = dists_ll[0] if dists_ll else []
314
- ids = ids_ll[0] if ids_ll else []
315
  if not ids and documents:
316
  synthesized = []
317
  for i, m in enumerate(metadatas):
@@ -320,6 +336,7 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
320
  idx = (m or {}).get("chunk_index", i)
321
  synthesized.append(f"{fn}:{sec}:{idx}")
322
  ids = synthesized
 
323
  print(f"[KB] search → {len(documents)} docs (top_k={top_k}); first distance: {distances[0] if distances else 'n/a'}; ids={len(ids)}")
324
  return {
325
  "documents": documents,
@@ -328,7 +345,7 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
328
  "ids": ids,
329
  }
330
 
331
- # ----------------------- Hybrid search (intent + module + action) -----------------------
332
  ACTION_SYNONYMS = {
333
  "create": ["create", "creation", "add", "new", "generate"],
334
  "update": ["update", "modify", "change", "edit"],
@@ -342,6 +359,7 @@ ERROR_INTENT_TERMS = [
342
  "escalation", "escalation path", "access right"
343
  ]
344
 
 
345
  def _detect_user_intent(query: str) -> str:
346
  q = (query or "").lower()
347
  if any(k in q for k in ERROR_INTENT_TERMS):
@@ -354,6 +372,7 @@ def _detect_user_intent(query: str) -> str:
354
  return "purpose"
355
  return "neutral"
356
 
 
357
  def _extract_actions(query: str) -> List[str]:
358
  q = (query or "").lower()
359
  found = []
@@ -362,6 +381,7 @@ def _extract_actions(query: str) -> List[str]:
362
  found.append(act)
363
  return found or []
364
 
 
365
  def _extract_modules_from_query(query: str) -> List[str]:
366
  q = (query or "").lower()
367
  found = []
@@ -372,6 +392,7 @@ def _extract_modules_from_query(query: str) -> List[str]:
372
  found = ["inventory"]
373
  return list(sorted(set(found)))
374
 
 
375
  def _intent_weight(meta: dict, user_intent: str) -> float:
376
  tag = (meta or {}).get("intent_tag", "neutral")
377
  if user_intent == "neutral":
@@ -380,13 +401,15 @@ def _intent_weight(meta: dict, user_intent: str) -> float:
380
  return 1.0
381
  if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]:
382
  return -0.6
383
- st = (meta or {}).get("section", "").lower()
384
  topics = (meta or {}).get("topic_tags", "") or ""
385
  topic_list = [t.strip() for t in topics.split(",") if t.strip()]
386
- if user_intent == "errors" and (any(k in st for k in ["escalation", "permissions", "access", "known issues"]) or ("permissions" in topic_list)):
387
- return 0.7
 
388
  return -0.2
389
 
 
390
  def _module_weight(meta: Dict[str, Any], user_modules: List[str]) -> float:
391
  if not user_modules:
392
  return 0.0
@@ -394,9 +417,10 @@ def _module_weight(meta: Dict[str, Any], user_modules: List[str]) -> float:
394
  doc_modules = [m.strip() for m in raw.split(",") if m.strip()] if isinstance(raw, str) else (raw or [])
395
  overlap = len(set(user_modules) & set(doc_modules))
396
  if overlap == 0:
397
- return -0.4 # demote different modules to avoid wrong SOP
398
  return 0.6 * overlap
399
 
 
400
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
401
  fn_tokens = _tokenize_meta_value(meta.get("filename"))
402
  title_tokens = _tokenize_meta_value(meta.get("title"))
@@ -410,6 +434,7 @@ def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
410
  inter = len(meta_tokens & qset)
411
  return inter / max(1, len(qset))
412
 
 
413
  def _action_weight(text: str, actions: List[str]) -> float:
414
  if not actions:
415
  return 0.0
@@ -427,6 +452,7 @@ def _action_weight(text: str, actions: List[str]) -> float:
427
  score -= 0.8
428
  return score
429
 
 
430
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
431
  norm_query = _normalize_query(query)
432
  q_terms = _tokenize(norm_query)
@@ -460,12 +486,13 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
460
  bm25_id_to_text[d["id"]] = d["text"]
461
  bm25_id_to_meta[d["id"]] = d["meta"]
462
 
 
463
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
464
 
465
  gamma = 0.30 # meta overlap
466
  delta = 0.45 # intent boost (stronger for errors)
467
  epsilon = 0.30 # action weight
468
- zeta = 0.50 # module weight (new)
469
 
470
  combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]] = []
471
  for cid in union_ids:
@@ -477,15 +504,19 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
477
  sem_meta = sem_metas[pos] if pos < len(sem_metas) else {}
478
  else:
479
  sem_sim, sem_dist, sem_text, sem_meta = 0.0, None, "", {}
 
480
  bm25_sim = bm25_id_to_norm.get(cid, 0.0)
481
  bm25_text = bm25_id_to_text.get(cid, "")
482
  bm25_meta = bm25_id_to_meta.get(cid, {})
 
483
  text = sem_text if sem_text else bm25_text
484
  meta = sem_meta if sem_meta else bm25_meta
 
485
  m_overlap = _meta_overlap(meta, q_terms)
486
  intent_boost = _intent_weight(meta, user_intent)
487
  act_wt = _action_weight(text, actions)
488
  mod_wt = _module_weight(meta, user_modules)
 
489
  final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost + epsilon * act_wt + zeta * mod_wt
490
  combined_records_ext.append(
491
  (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt)
@@ -505,8 +536,8 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
505
  total_action = sum(max(0.0, r[7]) for r in recs)
506
  total_module = sum(r[8] for r in recs)
507
  total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
508
- esc_weight = 0.3 if any("escalation" in (r[4] or {}).get("section", "").lower() for r in recs) else 0.0
509
- perm_weight = 0.3 if any("permissions" in ((r[4] or {}).get("topic_tags") or []) for r in recs) else 0.0
510
  return total_score + 0.4 * total_overlap + 0.7 * total_intent + 0.5 * total_action + 0.6 * total_module + 0.3 * total_penalty + esc_weight + perm_weight
511
 
512
  best_doc, best_doc_prior = None, -1.0
@@ -516,7 +547,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
516
  best_doc_prior, best_doc = p, fn
517
 
518
  best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
519
- other_recs = []
520
  for fn, recs in doc_groups.items():
521
  if fn == best_doc:
522
  continue
@@ -525,6 +556,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
525
 
526
  reordered = best_recs + other_recs
527
  top = reordered[:top_k]
 
528
  documents = [t[3] for t in top]
529
  metadatas = [t[4] for t in top]
530
  distances = [t[2] for t in top]
@@ -543,7 +575,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
543
  "actions": actions,
544
  }
545
 
546
- # ----------------------- Section fetch helpers -----------------------
547
  def get_section_text(filename: str, section: str) -> str:
548
  texts: List[str] = []
549
  for d in bm25_docs:
@@ -554,6 +586,7 @@ def get_section_text(filename: str, section: str) -> str:
554
  texts.append(t)
555
  return "\n\n".join(texts).strip()
556
 
 
557
  def get_best_steps_section_text(filename: str) -> str:
558
  texts: List[str] = []
559
  for d in bm25_docs:
@@ -564,6 +597,7 @@ def get_best_steps_section_text(filename: str) -> str:
564
  texts.append(t)
565
  return "\n\n".join(texts).strip()
566
 
 
567
  def get_best_errors_section_text(filename: str) -> str:
568
  texts: List[str] = []
569
  for d in bm25_docs:
@@ -577,6 +611,8 @@ def get_best_errors_section_text(filename: str) -> str:
577
  or "escalation" in sec
578
  or "permission" in sec
579
  or "access" in sec
 
 
580
  or ("permissions" in topic_list)
581
  ):
582
  t = (d.get("text") or "").strip()
@@ -584,7 +620,7 @@ def get_best_errors_section_text(filename: str) -> str:
584
  texts.append(t)
585
  return "\n\n".join(texts).strip()
586
 
587
- # ----------------------- Admin helpers -----------------------
588
  def get_kb_runtime_info() -> Dict[str, Any]:
589
  return {
590
  "chroma_path": CHROMA_PATH,
@@ -595,6 +631,7 @@ def get_kb_runtime_info() -> Dict[str, Any]:
595
  "bm25_ready": bm25_ready,
596
  }
597
 
 
598
  def reset_kb(folder_path: str) -> Dict[str, Any]:
599
  result = {"status": "OK", "message": "KB reset and re-ingested"}
600
  try:
@@ -604,14 +641,16 @@ def reset_kb(folder_path: str) -> Dict[str, Any]:
604
  pass
605
  global collection
606
  collection = client.get_or_create_collection(name="knowledge_base")
 
607
  try:
608
  if os.path.isfile(BM25_INDEX_FILE):
609
  os.remove(BM25_INDEX_FILE)
610
  except Exception as e:
611
  result.setdefault("warnings", []).append(f"bm25 index delete: {e}")
 
612
  os.makedirs(CHROMA_PATH, exist_ok=True)
613
  ingest_documents(folder_path)
614
  result["info"] = get_kb_runtime_info()
615
  return result
616
  except Exception as e:
617
- return {"status": "ERROR", "error": f"{e}", "info": get_kb_runtime_info()}
 
8
  from sentence_transformers import SentenceTransformer
9
  import chromadb
10
 
11
+ # ---------------------------- ChromaDB setup ----------------------------
12
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
13
  client = chromadb.PersistentClient(path=CHROMA_PATH)
14
  collection = client.get_or_create_collection(name="knowledge_base")
15
 
16
+ # ---------------------------- Embedding model ----------------------------
17
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
18
 
19
+ # ---------------------------- BM25 (lightweight) ----------------------------
20
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
21
  bm25_docs: List[Dict[str, Any]] = []
22
  bm25_inverted: Dict[str, List[int]] = {}
 
26
  BM25_K1 = 1.5
27
  BM25_B = 0.75
28
 
29
+ # ---------------------------- Utilities ----------------------------
30
  def _tokenize(text: str) -> List[str]:
31
  if not text:
32
  return []
33
  text = text.lower()
34
  return re.findall(r"[a-z0-9]+", text)
35
 
36
+
37
  def _normalize_query(q: str) -> str:
38
  q = (q or "").strip().lower()
39
  q = re.sub(r"[^\w\s]", " ", q)
40
  q = re.sub(r"\s+", " ", q).strip()
41
  return q
42
 
43
+
44
def _tokenize_meta_value(val: Optional[str]) -> List[str]:
    """Tokenize a metadata field value, treating ``None`` as the empty string."""
    return _tokenize("" if val is None else val)
46
 
47
+ # ---------------------------- DOCX parsing & chunking ----------------------------
48
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
49
  sections: List[Tuple[str, List[str]]] = []
50
  current_title = None
 
68
  sections = [("Document", all_text)]
69
  return sections
70
 
71
+
72
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
73
  body = "\n".join(paragraphs).strip()
74
  if not body:
 
83
  chunks = [body]
84
  return chunks
85
 
86
+ # ---------------------------- Intent & Module tagging ----------------------------
87
  SECTION_STEPS_HINTS = ["process steps", "procedure", "how to", "workflow", "instructions", "steps"]
88
+ SECTION_ERRORS_HINTS = ["common errors", "resolution", "troubleshooting", "known issues", "common issues", "escalation", "escalation path", "permissions", "access"]
89
 
90
  PERMISSION_TERMS = [
91
  "permission", "permissions", "access", "access right", "authorization", "authorisation",
 
93
  "not allowed", "not authorized", "denied", "restrict"
94
  ]
95
  ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
 
96
  STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
97
 
98
  MODULE_VOCAB = {
 
104
  "replenishment": ["replenishment", "replenish"],
105
  }
106
 
107
+
108
  def _infer_intent_tag(section_title: str) -> str:
109
  st = (section_title or "").lower()
110
  if any(k in st for k in SECTION_STEPS_HINTS):
 
117
  return "purpose"
118
  return "neutral"
119
 
120
+
121
  def _derive_semantic_intent_from_text(text: str) -> Tuple[str, List[str]]:
122
  t = (text or "").lower()
123
  tags: List[str] = []
 
125
  if any(term in t for term in PERMISSION_TERMS):
126
  intent = "errors"
127
  tags.append("permissions")
128
+ if "role" in t:
129
+ tags.append("role_access")
130
+ if "security" in t:
131
+ tags.append("security")
132
  if intent == "neutral" and any(term in t for term in ERROR_TERMS):
133
  intent = "errors"
134
  tags.append("errors")
 
137
  tags.append("procedure")
138
  return intent, list(set(tags))
139
 
140
+
141
  def _derive_module_tags(text: str, filename: str, section_title: str) -> List[str]:
142
  tokens = " ".join([filename or "", section_title or "", text or ""]).lower()
143
  found = []
 
149
  found = ["inventory"]
150
  return list(sorted(set(found)))
151
 
152
+ # ---------------------------- Ingestion ----------------------------
153
  def ingest_documents(folder_path: str) -> None:
154
  print(f"[KB] Checking folder: {folder_path}")
155
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
 
157
  if not files:
158
  print("[KB] WARNING: No .docx files found. Please check the folder path.")
159
  return
160
+
161
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
162
  bm25_docs, bm25_inverted, bm25_df = [], {}, {}
163
  bm25_avgdl, bm25_ready = 0.0, False
164
+
165
  for file in files:
166
  file_path = os.path.join(folder_path, file)
167
  doc_title = os.path.splitext(file)[0]
168
  doc = Document(file_path)
169
  sections = _split_by_sections(doc)
170
  total_chunks = 0
171
+
172
  for s_idx, (section_title, paras) in enumerate(sections):
173
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
174
  total_chunks += len(chunks)
175
+
176
  base_intent = _infer_intent_tag(section_title)
177
+
178
  for c_idx, chunk in enumerate(chunks):
179
  derived_intent, topic_tags = _derive_semantic_intent_from_text(chunk)
180
  final_intent = base_intent
 
182
  final_intent = "errors"
183
  elif base_intent == "neutral" and derived_intent in ("steps", "prereqs"):
184
  final_intent = derived_intent
 
185
 
186
+ module_tags = _derive_module_tags(chunk, file, section_title)
187
  embedding = model.encode(chunk).tolist()
188
  doc_id = f"{file}:{s_idx}:{c_idx}"
189
  meta = {
 
192
  "chunk_index": c_idx,
193
  "title": doc_title,
194
  "collection": "SOP",
195
+ "intent_tag": final_intent, # str
196
+ "topic_tags": ", ".join(topic_tags) if topic_tags else "", # str (NOT list)
197
+ "module_tags": ", ".join(module_tags) if module_tags else "", # str (NOT list)
198
  }
199
  try:
200
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
 
204
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
205
  except Exception as e2:
206
  print(f"[KB] ERROR: Upsert failed for {doc_id}: {e2}")
207
+
208
  tokens = _tokenize(chunk)
209
  tf: Dict[str, int] = {}
210
  for tkn in tokens:
211
  tf[tkn] = tf.get(tkn, 0) + 1
212
+
213
  idx = len(bm25_docs)
214
  bm25_docs.append({
215
  "id": doc_id,
 
225
  if term not in seen:
226
  bm25_df[term] = bm25_df.get(term, 0) + 1
227
  seen.add(term)
228
+
229
  print(f"[KB] Ingested {file} → {total_chunks} chunks")
230
+
231
  N = len(bm25_docs)
232
  if N > 0:
233
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
234
  bm25_ready = True
235
+
236
  payload = {
237
  "bm25_docs": bm25_docs,
238
  "bm25_inverted": bm25_inverted,
 
247
  print(f"[KB] BM25 index saved: {BM25_INDEX_FILE}")
248
  print(f"[KB] Documents ingested. Total entries in Chroma: {collection.count()}")
249
 
250
+ # ---------------------------- BM25 load ----------------------------
251
  def _load_bm25_index() -> None:
252
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
253
  if not os.path.exists(BM25_INDEX_FILE):
 
265
  except Exception as e:
266
  print(f"[KB] WARNING: Could not load BM25 index: {e}")
267
 
268
+
269
  _load_bm25_index()
270
 
271
+ # ---------------------------- BM25 search ----------------------------
272
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
273
  if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
274
  return 0.0
 
290
  except Exception:
291
  idf = 1.0
292
  denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
293
+ score += idf * (((tf * (BM25_K1 + 1)) / (denom or 1.0)))
294
  return score
295
 
296
+
297
  def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
298
  if not bm25_ready:
299
  return []
 
315
  scored.sort(key=lambda x: x[1], reverse=True)
316
  return scored[:top_k]
317
 
318
+ # ---------------------------- Semantic-only ----------------------------
319
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
320
  query_embedding = model.encode(query).tolist()
321
  res = collection.query(
322
  query_embeddings=[query_embedding],
323
  n_results=top_k,
324
+ include=['documents', 'metadatas', 'distances', 'ids']
325
  )
326
+ documents = (res.get("documents", [[]]) or [[]])[0]
327
+ metadatas = (res.get("metadatas", [[]]) or [[]])[0]
328
+ distances = (res.get("distances", [[]]) or [[]])[0]
329
+ ids = (res.get("ids", [[]]) or [[]])[0]
330
+
 
 
 
331
  if not ids and documents:
332
  synthesized = []
333
  for i, m in enumerate(metadatas):
 
336
  idx = (m or {}).get("chunk_index", i)
337
  synthesized.append(f"{fn}:{sec}:{idx}")
338
  ids = synthesized
339
+
340
  print(f"[KB] search → {len(documents)} docs (top_k={top_k}); first distance: {distances[0] if distances else 'n/a'}; ids={len(ids)}")
341
  return {
342
  "documents": documents,
 
345
  "ids": ids,
346
  }
347
 
348
+ # ---------------------------- Hybrid search (intent + module + action) ----------------------------
349
  ACTION_SYNONYMS = {
350
  "create": ["create", "creation", "add", "new", "generate"],
351
  "update": ["update", "modify", "change", "edit"],
 
359
  "escalation", "escalation path", "access right"
360
  ]
361
 
362
+
363
  def _detect_user_intent(query: str) -> str:
364
  q = (query or "").lower()
365
  if any(k in q for k in ERROR_INTENT_TERMS):
 
372
  return "purpose"
373
  return "neutral"
374
 
375
+
376
  def _extract_actions(query: str) -> List[str]:
377
  q = (query or "").lower()
378
  found = []
 
381
  found.append(act)
382
  return found or []
383
 
384
+
385
  def _extract_modules_from_query(query: str) -> List[str]:
386
  q = (query or "").lower()
387
  found = []
 
392
  found = ["inventory"]
393
  return list(sorted(set(found)))
394
 
395
+
396
  def _intent_weight(meta: dict, user_intent: str) -> float:
397
  tag = (meta or {}).get("intent_tag", "neutral")
398
  if user_intent == "neutral":
 
401
  return 1.0
402
  if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]:
403
  return -0.6
404
+ st = ((meta or {}).get("section", "") or "").lower()
405
  topics = (meta or {}).get("topic_tags", "") or ""
406
  topic_list = [t.strip() for t in topics.split(",") if t.strip()]
407
+ # Strongly prefer errors/escalation/permissions when the user intent is errors
408
+ if user_intent == "errors" and (any(k in st for k in ["escalation", "permissions", "access", "known issues", "common issues"]) or ("permissions" in topic_list)):
409
+ return 0.95
410
  return -0.2
411
 
412
+
413
  def _module_weight(meta: Dict[str, Any], user_modules: List[str]) -> float:
414
  if not user_modules:
415
  return 0.0
 
417
  doc_modules = [m.strip() for m in raw.split(",") if m.strip()] if isinstance(raw, str) else (raw or [])
418
  overlap = len(set(user_modules) & set(doc_modules))
419
  if overlap == 0:
420
+ return -0.4 # demote different modules to avoid wrong SOP
421
  return 0.6 * overlap
422
 
423
+
424
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
425
  fn_tokens = _tokenize_meta_value(meta.get("filename"))
426
  title_tokens = _tokenize_meta_value(meta.get("title"))
 
434
  inter = len(meta_tokens & qset)
435
  return inter / max(1, len(qset))
436
 
437
+
438
  def _action_weight(text: str, actions: List[str]) -> float:
439
  if not actions:
440
  return 0.0
 
452
  score -= 0.8
453
  return score
454
 
455
+
456
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
457
  norm_query = _normalize_query(query)
458
  q_terms = _tokenize(norm_query)
 
486
  bm25_id_to_text[d["id"]] = d["text"]
487
  bm25_id_to_meta[d["id"]] = d["meta"]
488
 
489
+ # Union of IDs from semantic and BM25
490
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
491
 
492
  gamma = 0.30 # meta overlap
493
  delta = 0.45 # intent boost (stronger for errors)
494
  epsilon = 0.30 # action weight
495
+ zeta = 0.50 # module weight (new)
496
 
497
  combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]] = []
498
  for cid in union_ids:
 
504
  sem_meta = sem_metas[pos] if pos < len(sem_metas) else {}
505
  else:
506
  sem_sim, sem_dist, sem_text, sem_meta = 0.0, None, "", {}
507
+
508
  bm25_sim = bm25_id_to_norm.get(cid, 0.0)
509
  bm25_text = bm25_id_to_text.get(cid, "")
510
  bm25_meta = bm25_id_to_meta.get(cid, {})
511
+
512
  text = sem_text if sem_text else bm25_text
513
  meta = sem_meta if sem_meta else bm25_meta
514
+
515
  m_overlap = _meta_overlap(meta, q_terms)
516
  intent_boost = _intent_weight(meta, user_intent)
517
  act_wt = _action_weight(text, actions)
518
  mod_wt = _module_weight(meta, user_modules)
519
+
520
  final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost + epsilon * act_wt + zeta * mod_wt
521
  combined_records_ext.append(
522
  (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt)
 
536
  total_action = sum(max(0.0, r[7]) for r in recs)
537
  total_module = sum(r[8] for r in recs)
538
  total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
539
+ esc_weight = 0.3 if any("escalation" in ((r[4] or {}).get("section", "")).lower() for r in recs) else 0.0
540
+ perm_weight = 0.3 if any("permissions" in (((r[4] or {}).get("topic_tags") or [])) for r in recs) else 0.0
541
  return total_score + 0.4 * total_overlap + 0.7 * total_intent + 0.5 * total_action + 0.6 * total_module + 0.3 * total_penalty + esc_weight + perm_weight
542
 
543
  best_doc, best_doc_prior = None, -1.0
 
547
  best_doc_prior, best_doc = p, fn
548
 
549
  best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
550
+ other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]] = []
551
  for fn, recs in doc_groups.items():
552
  if fn == best_doc:
553
  continue
 
556
 
557
  reordered = best_recs + other_recs
558
  top = reordered[:top_k]
559
+
560
  documents = [t[3] for t in top]
561
  metadatas = [t[4] for t in top]
562
  distances = [t[2] for t in top]
 
575
  "actions": actions,
576
  }
577
 
578
+ # ---------------------------- Section fetch helpers ----------------------------
579
  def get_section_text(filename: str, section: str) -> str:
580
  texts: List[str] = []
581
  for d in bm25_docs:
 
586
  texts.append(t)
587
  return "\n\n".join(texts).strip()
588
 
589
+
590
  def get_best_steps_section_text(filename: str) -> str:
591
  texts: List[str] = []
592
  for d in bm25_docs:
 
597
  texts.append(t)
598
  return "\n\n".join(texts).strip()
599
 
600
+
601
  def get_best_errors_section_text(filename: str) -> str:
602
  texts: List[str] = []
603
  for d in bm25_docs:
 
611
  or "escalation" in sec
612
  or "permission" in sec
613
  or "access" in sec
614
+ or "known issues" in sec
615
+ or "common issues" in sec
616
  or ("permissions" in topic_list)
617
  ):
618
  t = (d.get("text") or "").strip()
 
620
  texts.append(t)
621
  return "\n\n".join(texts).strip()
622
 
623
+ # ---------------------------- Admin helpers ----------------------------
624
  def get_kb_runtime_info() -> Dict[str, Any]:
625
  return {
626
  "chroma_path": CHROMA_PATH,
 
631
  "bm25_ready": bm25_ready,
632
  }
633
 
634
+
635
  def reset_kb(folder_path: str) -> Dict[str, Any]:
636
  result = {"status": "OK", "message": "KB reset and re-ingested"}
637
  try:
 
641
  pass
642
  global collection
643
  collection = client.get_or_create_collection(name="knowledge_base")
644
+
645
  try:
646
  if os.path.isfile(BM25_INDEX_FILE):
647
  os.remove(BM25_INDEX_FILE)
648
  except Exception as e:
649
  result.setdefault("warnings", []).append(f"bm25 index delete: {e}")
650
+
651
  os.makedirs(CHROMA_PATH, exist_ok=True)
652
  ingest_documents(folder_path)
653
  result["info"] = get_kb_runtime_info()
654
  return result
655
  except Exception as e:
656
+ return {"status": "ERROR", "error": f"{e}", "info": get_kb_runtime_info()}