srilakshu012456 commited on
Commit
c3ee443
·
verified ·
1 Parent(s): a27045c

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +147 -97
services/kb_creation.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  # services/kb_creation.py
3
  import os
4
  import re
@@ -8,12 +7,15 @@ from docx import Document
8
  from sentence_transformers import SentenceTransformer
9
  import chromadb
10
 
 
11
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
12
  client = chromadb.PersistentClient(path=CHROMA_PATH)
13
  collection = client.get_or_create_collection(name="knowledge_base")
14
 
 
15
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
16
 
 
17
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
18
  bm25_docs: List[Dict[str, Any]] = []
19
  bm25_inverted: Dict[str, List[int]] = {}
@@ -23,6 +25,7 @@ bm25_ready: bool = False
23
  BM25_K1 = 1.5
24
  BM25_B = 0.75
25
 
 
26
  def _tokenize(text: str) -> List[str]:
27
  if not text:
28
  return []
@@ -38,7 +41,8 @@ def _normalize_query(q: str) -> str:
38
  def _tokenize_meta_value(val: Optional[str]) -> List[str]:
39
  return _tokenize(val or "")
40
 
41
- BULLET_RE = re.compile(r"^\s*(?:[\-\*•]|\d+[.)])\s+", re.IGNORECASE)
 
42
 
43
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
44
  sections: List[Tuple[str, List[str]]] = []
@@ -64,6 +68,11 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
64
  return sections
65
 
66
  def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
 
 
 
 
 
67
  lines: List[str] = []
68
  for p in (paragraphs or []):
69
  p = (p or "").strip()
@@ -72,15 +81,19 @@ def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
72
  if BULLET_RE.match(p):
73
  lines.append(p)
74
  continue
75
- parts = [s.strip() for s in re.split(r"(?<=[.!?])\s+|;\s+|\s*→\s*|\s*->\s*|\s*then\s*", p) if s.strip()]
76
  lines.extend(parts)
77
  return lines
78
 
79
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 160) -> List[str]:
 
 
 
80
  lines = _paragraphs_to_lines(paragraphs)
81
  chunks: List[str] = []
82
  current: List[str] = []
83
  current_len = 0
 
84
  for ln in lines:
85
  w = ln.split()
86
  if current_len + len(w) > max_words or (BULLET_RE.match(ln) and current):
@@ -92,18 +105,22 @@ def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: Lis
92
  else:
93
  current.append(ln)
94
  current_len += len(w)
 
95
  if current:
96
  chunk = " ".join(current).strip()
97
  if chunk:
98
  chunks.append(chunk)
 
99
  if not chunks:
100
  body = " ".join(lines).strip()
101
  if body:
102
  chunks = [body]
103
  return chunks
104
 
 
105
  SECTION_STEPS_HINTS = ["process steps", "procedure", "how to", "workflow", "instructions", "steps"]
106
  SECTION_ERRORS_HINTS = ["common errors", "resolution", "troubleshooting", "known issues", "common issues", "escalation", "escalation path", "permissions", "access"]
 
107
  PERMISSION_TERMS = [
108
  "permission", "permissions", "access", "access right", "authorization", "authorisation",
109
  "role", "role access", "role mapping", "security", "security profile", "privilege", "insufficient",
@@ -111,9 +128,16 @@ PERMISSION_TERMS = [
111
  ]
112
  ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't", "mismatch", "locked", "wrong", "denied"]
113
  STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
 
114
  MODULE_VOCAB = {
115
- "receiving": ["receive", "receiving", "inbound receiving", "inbound", "goods receipt", "grn", "asn receiving", "unload", "check-in", "dock check-in"],
116
- "appointments": ["appointment", "appointments", "schedule", "scheduling", "slot", "dock door", "appointment creation", "appointment details"],
 
 
 
 
 
 
117
  "picking": ["pick", "picking", "pick release", "wave", "allocation"],
118
  "putaway": ["putaway", "staging", "put away", "location assignment"],
119
  "shipping": ["shipping", "ship confirm", "outbound", "load", "trailer"],
@@ -171,24 +195,7 @@ def _derive_module_tags(text: str, filename: str, section_title: str) -> List[st
171
  found = ["appointments"]
172
  return list(sorted(set(found)))
173
 
174
- ACTION_SYNONYMS = {
175
- "create": ["create", "creation", "add", "new", "generate", "book", "schedule"],
176
- "update": ["update", "modify", "change", "edit", "amend", "reschedule", "re-schedule"],
177
- "delete": ["delete", "remove"],
178
- "navigate": ["navigate", "go to", "open"],
179
- }
180
-
181
- def _derive_action_tags(text: str) -> List[str]:
182
- t = (text or "").lower()
183
- tags = []
184
- for act, syns in ACTION_SYNONYMS.items():
185
- if any(s in t for s in syns):
186
- tags.append(act)
187
- if "reschedule" in t or "re-schedule" in t:
188
- if "update" not in tags:
189
- tags.append("update")
190
- return sorted(set(tags))
191
-
192
  def ingest_documents(folder_path: str) -> None:
193
  print(f"[KB] Checking folder: {folder_path}")
194
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
@@ -196,19 +203,24 @@ def ingest_documents(folder_path: str) -> None:
196
  if not files:
197
  print("[KB] WARNING: No .docx files found. Please check the folder path.")
198
  return
 
199
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
200
  bm25_docs, bm25_inverted, bm25_df = [], {}, {}
201
  bm25_avgdl, bm25_ready = 0.0, False
 
202
  for file in files:
203
  file_path = os.path.join(folder_path, file)
204
  doc_title = os.path.splitext(file)[0]
205
  doc = Document(file_path)
206
  sections = _split_by_sections(doc)
207
  total_chunks = 0
 
208
  for s_idx, (section_title, paras) in enumerate(sections):
209
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=160)
210
  total_chunks += len(chunks)
 
211
  base_intent = _infer_intent_tag(section_title)
 
212
  for c_idx, chunk in enumerate(chunks):
213
  derived_intent, topic_tags = _derive_semantic_intent_from_text(chunk)
214
  final_intent = base_intent
@@ -216,8 +228,8 @@ def ingest_documents(folder_path: str) -> None:
216
  final_intent = "errors"
217
  elif base_intent == "neutral" and derived_intent in ("steps", "prereqs"):
218
  final_intent = derived_intent
 
219
  module_tags = _derive_module_tags(chunk, file, section_title)
220
- action_tags = _derive_action_tags(chunk)
221
  embedding = model.encode(chunk).tolist()
222
  doc_id = f"{file}:{s_idx}:{c_idx}"
223
  meta = {
@@ -229,7 +241,6 @@ def ingest_documents(folder_path: str) -> None:
229
  "intent_tag": final_intent,
230
  "topic_tags": ", ".join(topic_tags) if topic_tags else "",
231
  "module_tags": ", ".join(module_tags) if module_tags else "",
232
- "action_tags": ", ".join(action_tags) if action_tags else "",
233
  }
234
  try:
235
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
@@ -239,10 +250,12 @@ def ingest_documents(folder_path: str) -> None:
239
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
240
  except Exception as e2:
241
  print(f"[KB] ERROR: Upsert failed for {doc_id}: {e2}")
 
242
  tokens = _tokenize(chunk)
243
  tf: Dict[str, int] = {}
244
  for tkn in tokens:
245
  tf[tkn] = tf.get(tkn, 0) + 1
 
246
  idx = len(bm25_docs)
247
  bm25_docs.append({
248
  "id": doc_id,
@@ -258,11 +271,14 @@ def ingest_documents(folder_path: str) -> None:
258
  if term not in seen:
259
  bm25_df[term] = bm25_df.get(term, 0) + 1
260
  seen.add(term)
 
261
  print(f"[KB] Ingested {file} → {total_chunks} chunks")
 
262
  N = len(bm25_docs)
263
  if N > 0:
264
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
265
  bm25_ready = True
 
266
  payload = {
267
  "bm25_docs": bm25_docs,
268
  "bm25_inverted": bm25_inverted,
@@ -277,7 +293,7 @@ def ingest_documents(folder_path: str) -> None:
277
  print(f"[KB] BM25 index saved: {BM25_INDEX_FILE}")
278
  print(f"[KB] Documents ingested. Total entries in Chroma: {collection.count()}")
279
 
280
-
281
  def _load_bm25_index() -> None:
282
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
283
  if not os.path.exists(BM25_INDEX_FILE):
@@ -297,7 +313,7 @@ def _load_bm25_index() -> None:
297
 
298
  _load_bm25_index()
299
 
300
-
301
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
302
  if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
303
  return 0.0
@@ -312,9 +328,10 @@ def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
312
  if tf == 0:
313
  continue
314
  N = len(bm25_docs)
 
315
  try:
316
  import math
317
- idf = math.log(((N - df + 0.5) / (df + 0.5)) + 1.0)
318
  except Exception:
319
  idf = 1.0
320
  denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
@@ -342,13 +359,19 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
342
  scored.sort(key=lambda x: x[1], reverse=True)
343
  return scored[:top_k]
344
 
345
-
346
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
347
  query_embedding = model.encode(query).tolist()
348
- res = collection.query(query_embeddings=[query_embedding], n_results=top_k, include=['documents', 'metadatas', 'distances'])
 
 
 
 
349
  documents = (res.get("documents", [[]]) or [[]])[0]
350
  metadatas = (res.get("metadatas", [[]]) or [[]])[0]
351
  distances = (res.get("distances", [[]]) or [[]])[0]
 
 
352
  ids: List[str] = []
353
  if documents:
354
  synthesized = []
@@ -358,8 +381,22 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
358
  idx = (m or {}).get("chunk_index", i)
359
  synthesized.append(f"{fn}:{sec}:{idx}")
360
  ids = synthesized
361
- return {"documents": documents, "metadatas": metadatas, "distances": distances, "ids": ids}
362
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
  ERROR_INTENT_TERMS = [
364
  "error", "issue", "fail", "not working", "resolution", "fix",
365
  "permission", "permissions", "access", "no access", "authorization", "authorisation",
@@ -410,12 +447,12 @@ def _action_weight(text: str, actions: List[str]) -> float:
410
  for syn in ACTION_SYNONYMS.get(act, [act]):
411
  if syn in t:
412
  score += 1.0
413
- conflicts = {"create": ["update"], "update": ["create"], "delete": ["create", "update"], "navigate": []}
414
  for act in actions:
415
  for bad in conflicts.get(act, []):
416
  for syn in ACTION_SYNONYMS.get(bad, [bad]):
417
  if syn in t:
418
- score -= 1.0
419
  return score
420
 
421
  def _module_weight(meta: Dict[str, Any], user_modules: List[str]) -> float:
@@ -439,11 +476,12 @@ def _intent_weight(meta: dict, user_intent: str) -> float:
439
  st = ((meta or {}).get("section", "") or "").lower()
440
  topics = (meta or {}).get("topic_tags", "") or ""
441
  topic_list = [t.strip() for t in topics.split(",") if t.strip()]
 
442
  if user_intent == "errors" and (
443
- any(k in st for k in ["common errors", "known issues", "common issues", "errors", "escalation", "permissions", "access"]) or
444
- ("permissions" in topic_list)
445
  ):
446
- return 1.10
447
  if user_intent == "steps" and any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
448
  return 0.75
449
  return -0.2
@@ -480,6 +518,7 @@ def _phrase_boost_score(text: str, q_terms: List[str]) -> float:
480
  return min(score, 2.0)
481
 
482
  def _literal_query_match_boost(text: str, query_norm: str) -> float:
 
483
  t = (text or "").lower()
484
  q = (query_norm or "").lower()
485
  boost = 0.0
@@ -493,22 +532,6 @@ def _literal_query_match_boost(text: str, query_norm: str) -> float:
493
  break
494
  return min(boost, 1.6)
495
 
496
- def _action_meta_weight(meta: Dict[str, Any], user_actions: List[str]) -> float:
497
- raw = (meta or {}).get("action_tags", "") or ""
498
- doc_actions = [a.strip().lower() for a in raw.split(",") if a.strip()] if isinstance(raw, str) else (raw or [])
499
- if not user_actions:
500
- return 0.0
501
- ua = set(a.lower() for a in user_actions)
502
- da = set(doc_actions)
503
- overlap = len(ua & da)
504
- if overlap > 0:
505
- return 1.2 * overlap
506
- if "update" in ua and "create" in da and "update" not in da:
507
- return -1.4
508
- if "create" in ua and "update" in da and "create" not in da:
509
- return -1.2
510
- return -0.4
511
-
512
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
513
  norm_query = _normalize_query(query)
514
  q_terms = _tokenize(norm_query)
@@ -529,12 +552,12 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
529
  return 1.0 / (1.0 + float(d))
530
  except Exception:
531
  return 0.0
 
532
  sem_sims = [dist_to_sim(d) for d in sem_dists]
533
 
534
  bm25_hits = bm25_search(norm_query, top_k=max(80, top_k * 6))
535
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
536
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
537
-
538
  bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
539
  for idx, nscore in bm25_norm_pairs:
540
  d = bm25_docs[idx]
@@ -544,17 +567,15 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
544
 
545
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
546
 
547
- gamma = 0.30
548
- delta = 0.55
549
- epsilon = 0.50
550
- zeta = 0.65
551
- eta = 0.50
552
- theta = 0.40
553
- iota = 0.60
554
- kappa = 0.90
555
-
556
- combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float, float]] = []
557
 
 
558
  for cid in union_ids:
559
  if cid in sem_ids:
560
  pos = sem_ids.index(cid)
@@ -564,17 +585,21 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
564
  sem_meta = sem_metas[pos] if pos < len(sem_metas) else {}
565
  else:
566
  sem_sim, sem_dist, sem_text, sem_meta = 0.0, None, "", {}
 
567
  bm25_sim = bm25_id_to_norm.get(cid, 0.0)
568
  bm25_text = bm25_id_to_text.get(cid, "")
569
  bm25_meta = bm25_id_to_meta.get(cid, {})
 
570
  text = sem_text if sem_text else bm25_text
571
  meta = sem_meta if sem_meta else bm25_meta
 
572
  m_overlap = _meta_overlap(meta, q_terms)
573
  intent_boost = _intent_weight(meta, user_intent)
574
  act_wt = _action_weight(text, actions)
575
  mod_wt = _module_weight(meta, user_modules)
576
  phrase_wt = _phrase_boost_score(text, q_terms)
577
  literal_wt = _literal_query_match_boost(text, norm_query)
 
578
  sec_low = ((meta or {}).get("section", "") or "").lower()
579
  title_low = ((meta or {}).get("title", "") or "").lower()
580
  heading_bonus = 0.0
@@ -584,32 +609,49 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
584
  heading_bonus += 0.40
585
  if any(root in sec_low for root in ["appointment", "appointments", "schedule"]) and "receiv" in norm_query:
586
  heading_bonus -= 0.35
587
- action_meta_wt = _action_meta_weight(meta, actions)
588
  final_score = (
589
- alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost + epsilon * act_wt + zeta * mod_wt + eta * phrase_wt + theta * heading_bonus + iota * literal_wt + kappa * action_meta_wt
 
 
 
 
 
 
 
 
 
 
 
 
590
  )
591
- combined_records_ext.append((cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt, phrase_wt, heading_bonus, literal_wt, action_meta_wt))
592
 
593
- if _detect_user_intent(query) == "errors":
 
594
  exact_hits = []
595
  for rec in combined_records_ext:
596
  text_lower = (rec[3] or "").lower()
597
- if any(phrase in text_lower for phrase in [norm_query, *(_make_ngrams([tok for tok in norm_query.split() if len(tok) > 2], 2))]):
 
 
 
 
598
  exact_hits.append(rec)
599
  if exact_hits:
 
600
  rest = [r for r in combined_records_ext if r not in exact_hits]
601
  exact_hits.sort(key=lambda x: x[1], reverse=True)
602
  rest.sort(key=lambda x: x[1], reverse=True)
603
  combined_records_ext = exact_hits + rest
604
 
605
  from collections import defaultdict
606
- doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float, float]]] = defaultdict(list)
607
  for rec in combined_records_ext:
608
  meta = rec[4] or {}
609
  fn = meta.get("filename", "unknown")
610
  doc_groups[fn].append(rec)
611
 
612
- def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float, float]]) -> float:
613
  total_score = sum(r[1] for r in recs)
614
  total_overlap = sum(r[5] for r in recs)
615
  total_intent = sum(max(0.0, r[6]) for r in recs)
@@ -618,13 +660,23 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
618
  total_phrase = sum(r[9] for r in recs)
619
  total_heading = sum(r[10] for r in recs)
620
  total_literal = sum(r[11] for r in recs)
621
- total_action_meta = sum(r[12] for r in recs)
622
  total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
 
623
  errors_section_bonus = 0.0
624
- if any("error" in ((r[4] or {}).get("section", "")).lower() or "known issues" in ((r[4] or {}).get("section", "")).lower() or "common issues" in ((r[4] or {}).get("section", "")).lower() for r in recs):
 
625
  errors_section_bonus = 0.5
626
  return (
627
- total_score + 0.4 * total_overlap + 0.7 * total_intent + 0.5 * total_action + 0.8 * total_module + 0.6 * total_phrase + 0.6 * total_heading + 0.7 * total_literal + 0.9 * total_action_meta + errors_section_bonus + 0.3 * total_penalty
 
 
 
 
 
 
 
 
 
628
  )
629
 
630
  best_doc, best_doc_prior = None, -1.0
@@ -634,22 +686,35 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
634
  best_doc_prior, best_doc = p, fn
635
 
636
  best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
637
- other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float, float]] = []
638
  for fn, recs in doc_groups.items():
639
  if fn == best_doc:
640
  continue
641
  other_recs.extend(recs)
642
  other_recs.sort(key=lambda x: x[1], reverse=True)
 
643
  reordered = best_recs + other_recs
644
  top = reordered[:top_k]
 
645
  documents = [t[3] for t in top]
646
  metadatas = [t[4] for t in top]
647
  distances = [t[2] for t in top]
648
  ids = [t[0] for t in top]
649
  combined_scores = [t[1] for t in top]
650
- return {"documents": documents, "metadatas": metadatas, "distances": distances, "ids": ids, "combined_scores": combined_scores, "best_doc": best_doc, "best_doc_prior": best_doc_prior, "user_intent": user_intent, "actions": actions}
651
 
 
 
 
 
 
 
 
 
 
 
 
652
 
 
653
  def get_section_text(filename: str, section: str) -> str:
654
  texts: List[str] = []
655
  for d in bm25_docs:
@@ -658,7 +723,7 @@ def get_section_text(filename: str, section: str) -> str:
658
  t = (d.get("text") or "").strip()
659
  if t:
660
  texts.append(t)
661
- return "".join(texts).strip()
662
 
663
  def get_best_steps_section_text(filename: str) -> str:
664
  texts: List[str] = []
@@ -668,13 +733,13 @@ def get_best_steps_section_text(filename: str) -> str:
668
  t = (d.get("text") or "").strip()
669
  if t:
670
  texts.append(t)
671
- return "".join(texts).strip()
672
 
673
  def get_best_errors_section_text(filename: str) -> str:
674
  texts: List[str] = []
675
  for d in bm25_docs:
676
  m = d.get("meta", {})
677
- sec = ((m.get("section") or "").lower())
678
  topics = (m.get("topic_tags") or "")
679
  topic_list = [t.strip() for t in topics.split(",") if t.strip()]
680
  if m.get("filename") == filename and (
@@ -691,26 +756,9 @@ def get_best_errors_section_text(filename: str) -> str:
691
  t = (d.get("text") or "").strip()
692
  if t:
693
  texts.append(t)
694
- return "".join(texts).strip()
695
-
696
- def get_steps_text_by_action(filename: str, preferred_actions: List[str]) -> str:
697
- if not preferred_actions:
698
- return ""
699
- actions = set(a.lower() for a in preferred_actions)
700
- texts: List[str] = []
701
- for d in bm25_docs:
702
- m = d.get("meta", {})
703
- if m.get("filename") != filename or m.get("intent_tag") != "steps":
704
- continue
705
- raw = (m.get("action_tags") or "").lower()
706
- doc_actions = set(a.strip() for a in raw.split(",") if a.strip())
707
- if actions & doc_actions:
708
- t = (d.get("text") or "").strip()
709
- if t:
710
- texts.append(t)
711
- return "".join(texts).strip()
712
-
713
 
 
714
  def get_kb_runtime_info() -> Dict[str, Any]:
715
  return {
716
  "chroma_path": CHROMA_PATH,
@@ -730,11 +778,13 @@ def reset_kb(folder_path: str) -> Dict[str, Any]:
730
  pass
731
  global collection
732
  collection = client.get_or_create_collection(name="knowledge_base")
 
733
  try:
734
  if os.path.isfile(BM25_INDEX_FILE):
735
  os.remove(BM25_INDEX_FILE)
736
  except Exception as e:
737
  result.setdefault("warnings", []).append(f"bm25 index delete: {e}")
 
738
  os.makedirs(CHROMA_PATH, exist_ok=True)
739
  ingest_documents(folder_path)
740
  result["info"] = get_kb_runtime_info()
 
 
1
  # services/kb_creation.py
2
  import os
3
  import re
 
7
  from sentence_transformers import SentenceTransformer
8
  import chromadb
9
 
10
+ # ---------------------------- ChromaDB setup ----------------------------
11
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
12
  client = chromadb.PersistentClient(path=CHROMA_PATH)
13
  collection = client.get_or_create_collection(name="knowledge_base")
14
 
15
+ # ---------------------------- Embedding model ----------------------------
16
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
17
 
18
+ # ---------------------------- BM25 (lightweight) ----------------------------
19
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
20
  bm25_docs: List[Dict[str, Any]] = []
21
  bm25_inverted: Dict[str, List[int]] = {}
 
25
  BM25_K1 = 1.5
26
  BM25_B = 0.75
27
 
28
+ # ---------------------------- Utilities ----------------------------
29
  def _tokenize(text: str) -> List[str]:
30
  if not text:
31
  return []
 
41
  def _tokenize_meta_value(val: Optional[str]) -> List[str]:
42
  return _tokenize(val or "")
43
 
44
+ # ---------------------------- DOCX parsing & chunking ----------------------------
45
+ BULLET_RE = re.compile(r"^\s*(?:[\-\*\u2022]|\d+[.)])\s+", re.IGNORECASE)
46
 
47
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
48
  sections: List[Tuple[str, List[str]]] = []
 
68
  return sections
69
 
70
  def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
71
+ """
72
+ Split paragraphs into bullet-aware lines:
73
+ - Preserve bullets/numbered list lines as separate atomic lines.
74
+ - Split long paragraphs by sentence boundaries.
75
+ """
76
  lines: List[str] = []
77
  for p in (paragraphs or []):
78
  p = (p or "").strip()
 
81
  if BULLET_RE.match(p):
82
  lines.append(p)
83
  continue
84
+ parts = [s.strip() for s in re.split(r"(?<=[.!?])\s+", p) if s.strip()]
85
  lines.extend(parts)
86
  return lines
87
 
88
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 160) -> List[str]:
89
+ """
90
+ Smaller chunks (≈160 words), bullet-aware for better recall of error bullets.
91
+ """
92
  lines = _paragraphs_to_lines(paragraphs)
93
  chunks: List[str] = []
94
  current: List[str] = []
95
  current_len = 0
96
+
97
  for ln in lines:
98
  w = ln.split()
99
  if current_len + len(w) > max_words or (BULLET_RE.match(ln) and current):
 
105
  else:
106
  current.append(ln)
107
  current_len += len(w)
108
+
109
  if current:
110
  chunk = " ".join(current).strip()
111
  if chunk:
112
  chunks.append(chunk)
113
+
114
  if not chunks:
115
  body = " ".join(lines).strip()
116
  if body:
117
  chunks = [body]
118
  return chunks
119
 
120
+ # ---------------------------- Intent & Module tagging ----------------------------
121
  SECTION_STEPS_HINTS = ["process steps", "procedure", "how to", "workflow", "instructions", "steps"]
122
  SECTION_ERRORS_HINTS = ["common errors", "resolution", "troubleshooting", "known issues", "common issues", "escalation", "escalation path", "permissions", "access"]
123
+
124
  PERMISSION_TERMS = [
125
  "permission", "permissions", "access", "access right", "authorization", "authorisation",
126
  "role", "role access", "role mapping", "security", "security profile", "privilege", "insufficient",
 
128
  ]
129
  ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't", "mismatch", "locked", "wrong", "denied"]
130
  STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
131
+
132
  MODULE_VOCAB = {
133
+ "receiving": [
134
+ "receive", "receiving", "inbound receiving", "inbound", "goods receipt", "grn",
135
+ "asn receiving", "unload", "check-in", "dock check-in"
136
+ ],
137
+ "appointments": [
138
+ "appointment", "appointments", "schedule", "scheduling", "slot", "dock door",
139
+ "appointment creation", "appointment details"
140
+ ],
141
  "picking": ["pick", "picking", "pick release", "wave", "allocation"],
142
  "putaway": ["putaway", "staging", "put away", "location assignment"],
143
  "shipping": ["shipping", "ship confirm", "outbound", "load", "trailer"],
 
195
  found = ["appointments"]
196
  return list(sorted(set(found)))
197
 
198
+ # ---------------------------- Ingestion ----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  def ingest_documents(folder_path: str) -> None:
200
  print(f"[KB] Checking folder: {folder_path}")
201
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
 
203
  if not files:
204
  print("[KB] WARNING: No .docx files found. Please check the folder path.")
205
  return
206
+
207
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
208
  bm25_docs, bm25_inverted, bm25_df = [], {}, {}
209
  bm25_avgdl, bm25_ready = 0.0, False
210
+
211
  for file in files:
212
  file_path = os.path.join(folder_path, file)
213
  doc_title = os.path.splitext(file)[0]
214
  doc = Document(file_path)
215
  sections = _split_by_sections(doc)
216
  total_chunks = 0
217
+
218
  for s_idx, (section_title, paras) in enumerate(sections):
219
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=160)
220
  total_chunks += len(chunks)
221
+
222
  base_intent = _infer_intent_tag(section_title)
223
+
224
  for c_idx, chunk in enumerate(chunks):
225
  derived_intent, topic_tags = _derive_semantic_intent_from_text(chunk)
226
  final_intent = base_intent
 
228
  final_intent = "errors"
229
  elif base_intent == "neutral" and derived_intent in ("steps", "prereqs"):
230
  final_intent = derived_intent
231
+
232
  module_tags = _derive_module_tags(chunk, file, section_title)
 
233
  embedding = model.encode(chunk).tolist()
234
  doc_id = f"{file}:{s_idx}:{c_idx}"
235
  meta = {
 
241
  "intent_tag": final_intent,
242
  "topic_tags": ", ".join(topic_tags) if topic_tags else "",
243
  "module_tags": ", ".join(module_tags) if module_tags else "",
 
244
  }
245
  try:
246
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
 
250
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
251
  except Exception as e2:
252
  print(f"[KB] ERROR: Upsert failed for {doc_id}: {e2}")
253
+
254
  tokens = _tokenize(chunk)
255
  tf: Dict[str, int] = {}
256
  for tkn in tokens:
257
  tf[tkn] = tf.get(tkn, 0) + 1
258
+
259
  idx = len(bm25_docs)
260
  bm25_docs.append({
261
  "id": doc_id,
 
271
  if term not in seen:
272
  bm25_df[term] = bm25_df.get(term, 0) + 1
273
  seen.add(term)
274
+
275
  print(f"[KB] Ingested {file} → {total_chunks} chunks")
276
+
277
  N = len(bm25_docs)
278
  if N > 0:
279
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
280
  bm25_ready = True
281
+
282
  payload = {
283
  "bm25_docs": bm25_docs,
284
  "bm25_inverted": bm25_inverted,
 
293
  print(f"[KB] BM25 index saved: {BM25_INDEX_FILE}")
294
  print(f"[KB] Documents ingested. Total entries in Chroma: {collection.count()}")
295
 
296
+ # ---------------------------- BM25 load ----------------------------
297
  def _load_bm25_index() -> None:
298
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
299
  if not os.path.exists(BM25_INDEX_FILE):
 
313
 
314
  _load_bm25_index()
315
 
316
+ # ---------------------------- BM25 search ----------------------------
317
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
318
  if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
319
  return 0.0
 
328
  if tf == 0:
329
  continue
330
  N = len(bm25_docs)
331
+ idf_ratio = ((N - df + 0.5) / (df + 0.5))
332
  try:
333
  import math
334
+ idf = math.log(idf_ratio + 1.0)
335
  except Exception:
336
  idf = 1.0
337
  denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
 
359
  scored.sort(key=lambda x: x[1], reverse=True)
360
  return scored[:top_k]
361
 
362
+ # ---------------------------- Semantic-only ----------------------------
363
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
364
  query_embedding = model.encode(query).tolist()
365
+ res = collection.query(
366
+ query_embeddings=[query_embedding],
367
+ n_results=top_k,
368
+ include=['documents', 'metadatas', 'distances'] # no 'ids'
369
+ )
370
  documents = (res.get("documents", [[]]) or [[]])[0]
371
  metadatas = (res.get("metadatas", [[]]) or [[]])[0]
372
  distances = (res.get("distances", [[]]) or [[]])[0]
373
+
374
+ # Synthesize IDs from metadata (filename:section:chunk_index)
375
  ids: List[str] = []
376
  if documents:
377
  synthesized = []
 
381
  idx = (m or {}).get("chunk_index", i)
382
  synthesized.append(f"{fn}:{sec}:{idx}")
383
  ids = synthesized
 
384
 
385
+ print(f"[KB] search → {len(documents)} docs (top_k={top_k}); first distance: {distances[0] if distances else 'n/a'}; ids synthesized={len(ids)}")
386
+ return {
387
+ "documents": documents,
388
+ "metadatas": metadatas,
389
+ "distances": distances,
390
+ "ids": ids,
391
+ }
392
+
393
+ # ---------------------------- Hybrid search (improved + exact-match rerank) ----------------------------
394
+ ACTION_SYNONYMS = {
395
+ "create": ["create", "creation", "add", "new", "generate"],
396
+ "update": ["update", "modify", "change", "edit"],
397
+ "delete": ["delete", "remove"],
398
+ "navigate": ["navigate", "go to", "open"],
399
+ }
400
  ERROR_INTENT_TERMS = [
401
  "error", "issue", "fail", "not working", "resolution", "fix",
402
  "permission", "permissions", "access", "no access", "authorization", "authorisation",
 
447
  for syn in ACTION_SYNONYMS.get(act, [act]):
448
  if syn in t:
449
  score += 1.0
450
+ conflicts = {"create": ["delete"], "delete": ["create"], "update": ["delete"], "navigate": []}
451
  for act in actions:
452
  for bad in conflicts.get(act, []):
453
  for syn in ACTION_SYNONYMS.get(bad, [bad]):
454
  if syn in t:
455
+ score -= 0.8
456
  return score
457
 
458
  def _module_weight(meta: Dict[str, Any], user_modules: List[str]) -> float:
 
476
  st = ((meta or {}).get("section", "") or "").lower()
477
  topics = (meta or {}).get("topic_tags", "") or ""
478
  topic_list = [t.strip() for t in topics.split(",") if t.strip()]
479
+ # Prefer errors sections strongly
480
  if user_intent == "errors" and (
481
+ any(k in st for k in ["common errors", "known issues", "common issues", "errors", "escalation", "permissions", "access"])
482
+ or ("permissions" in topic_list)
483
  ):
484
+ return 1.10 # stronger than before
485
  if user_intent == "steps" and any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
486
  return 0.75
487
  return -0.2
 
518
  return min(score, 2.0)
519
 
520
  def _literal_query_match_boost(text: str, query_norm: str) -> float:
521
+ """Extra boost if exact normalized query substring or bigrams appear."""
522
  t = (text or "").lower()
523
  q = (query_norm or "").lower()
524
  boost = 0.0
 
532
  break
533
  return min(boost, 1.6)
534
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
536
  norm_query = _normalize_query(query)
537
  q_terms = _tokenize(norm_query)
 
552
  return 1.0 / (1.0 + float(d))
553
  except Exception:
554
  return 0.0
555
+
556
  sem_sims = [dist_to_sim(d) for d in sem_dists]
557
 
558
  bm25_hits = bm25_search(norm_query, top_k=max(80, top_k * 6))
559
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
560
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
 
561
  bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
562
  for idx, nscore in bm25_norm_pairs:
563
  d = bm25_docs[idx]
 
567
 
568
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
569
 
570
+ gamma = 0.30 # meta overlap
571
+ delta = 0.55 # intent boost (stronger)
572
+ epsilon = 0.30 # action weight
573
+ zeta = 0.65 # module weight
574
+ eta = 0.50 # phrase-level boost (stronger)
575
+ theta = 0.40 # heading alignment bonus
576
+ iota = 0.60 # literal query match boost (stronger)
 
 
 
577
 
578
+ combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]] = []
579
  for cid in union_ids:
580
  if cid in sem_ids:
581
  pos = sem_ids.index(cid)
 
585
  sem_meta = sem_metas[pos] if pos < len(sem_metas) else {}
586
  else:
587
  sem_sim, sem_dist, sem_text, sem_meta = 0.0, None, "", {}
588
+
589
  bm25_sim = bm25_id_to_norm.get(cid, 0.0)
590
  bm25_text = bm25_id_to_text.get(cid, "")
591
  bm25_meta = bm25_id_to_meta.get(cid, {})
592
+
593
  text = sem_text if sem_text else bm25_text
594
  meta = sem_meta if sem_meta else bm25_meta
595
+
596
  m_overlap = _meta_overlap(meta, q_terms)
597
  intent_boost = _intent_weight(meta, user_intent)
598
  act_wt = _action_weight(text, actions)
599
  mod_wt = _module_weight(meta, user_modules)
600
  phrase_wt = _phrase_boost_score(text, q_terms)
601
  literal_wt = _literal_query_match_boost(text, norm_query)
602
+
603
  sec_low = ((meta or {}).get("section", "") or "").lower()
604
  title_low = ((meta or {}).get("title", "") or "").lower()
605
  heading_bonus = 0.0
 
609
  heading_bonus += 0.40
610
  if any(root in sec_low for root in ["appointment", "appointments", "schedule"]) and "receiv" in norm_query:
611
  heading_bonus -= 0.35
612
+
613
  final_score = (
614
+ alpha * sem_sim
615
+ + beta * bm25_sim
616
+ + gamma * m_overlap
617
+ + delta * intent_boost
618
+ + epsilon * act_wt
619
+ + zeta * mod_wt
620
+ + eta * phrase_wt
621
+ + theta * heading_bonus
622
+ + iota * literal_wt
623
+ )
624
+
625
+ combined_records_ext.append(
626
+ (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt, phrase_wt, heading_bonus, literal_wt)
627
  )
 
628
 
629
+ # ---- Exact-match rerank for errors ----
630
+ if user_intent == "errors":
631
  exact_hits = []
632
  for rec in combined_records_ext:
633
  text_lower = (rec[3] or "").lower()
634
+ if any(phrase in text_lower for phrase in [
635
+ norm_query, # whole normalized query
636
+ # common 2-gram patterns extracted from the query
637
+ *(_make_ngrams([tok for tok in norm_query.split() if len(tok) > 2], 2))
638
+ ]):
639
  exact_hits.append(rec)
640
  if exact_hits:
641
+ # Move exact hits to front and keep order by current final_score
642
  rest = [r for r in combined_records_ext if r not in exact_hits]
643
  exact_hits.sort(key=lambda x: x[1], reverse=True)
644
  rest.sort(key=lambda x: x[1], reverse=True)
645
  combined_records_ext = exact_hits + rest
646
 
647
  from collections import defaultdict
648
+ doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]] = defaultdict(list)
649
  for rec in combined_records_ext:
650
  meta = rec[4] or {}
651
  fn = meta.get("filename", "unknown")
652
  doc_groups[fn].append(rec)
653
 
654
+ def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]) -> float:
655
  total_score = sum(r[1] for r in recs)
656
  total_overlap = sum(r[5] for r in recs)
657
  total_intent = sum(max(0.0, r[6]) for r in recs)
 
660
  total_phrase = sum(r[9] for r in recs)
661
  total_heading = sum(r[10] for r in recs)
662
  total_literal = sum(r[11] for r in recs)
 
663
  total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
664
+ # Errors doc prior: if many chunks are from an errors/known issues section, add a bonus
665
  errors_section_bonus = 0.0
666
+ if any("error" in ((r[4] or {}).get("section", "")).lower() or "known issues" in ((r[4] or {}).get("section", "")).lower()
667
+ or "common issues" in ((r[4] or {}).get("section", "")).lower() for r in recs):
668
  errors_section_bonus = 0.5
669
  return (
670
+ total_score
671
+ + 0.4 * total_overlap
672
+ + 0.7 * total_intent
673
+ + 0.5 * total_action
674
+ + 0.8 * total_module
675
+ + 0.6 * total_phrase
676
+ + 0.6 * total_heading
677
+ + 0.7 * total_literal
678
+ + errors_section_bonus
679
+ + 0.3 * total_penalty
680
  )
681
 
682
  best_doc, best_doc_prior = None, -1.0
 
686
  best_doc_prior, best_doc = p, fn
687
 
688
  best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
689
+ other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]] = []
690
  for fn, recs in doc_groups.items():
691
  if fn == best_doc:
692
  continue
693
  other_recs.extend(recs)
694
  other_recs.sort(key=lambda x: x[1], reverse=True)
695
+
696
  reordered = best_recs + other_recs
697
  top = reordered[:top_k]
698
+
699
  documents = [t[3] for t in top]
700
  metadatas = [t[4] for t in top]
701
  distances = [t[2] for t in top]
702
  ids = [t[0] for t in top]
703
  combined_scores = [t[1] for t in top]
 
704
 
705
+ return {
706
+ "documents": documents,
707
+ "metadatas": metadatas,
708
+ "distances": distances,
709
+ "ids": ids,
710
+ "combined_scores": combined_scores,
711
+ "best_doc": best_doc,
712
+ "best_doc_prior": best_doc_prior,
713
+ "user_intent": user_intent,
714
+ "actions": actions,
715
+ }
716
 
717
+ # ---------------------------- Section fetch helpers ----------------------------
718
  def get_section_text(filename: str, section: str) -> str:
719
  texts: List[str] = []
720
  for d in bm25_docs:
 
723
  t = (d.get("text") or "").strip()
724
  if t:
725
  texts.append(t)
726
+ return "\n\n".join(texts).strip()
727
 
728
  def get_best_steps_section_text(filename: str) -> str:
729
  texts: List[str] = []
 
733
  t = (d.get("text") or "").strip()
734
  if t:
735
  texts.append(t)
736
+ return "\n\n".join(texts).strip()
737
 
738
  def get_best_errors_section_text(filename: str) -> str:
739
  texts: List[str] = []
740
  for d in bm25_docs:
741
  m = d.get("meta", {})
742
+ sec = (m.get("section") or "").lower()
743
  topics = (m.get("topic_tags") or "")
744
  topic_list = [t.strip() for t in topics.split(",") if t.strip()]
745
  if m.get("filename") == filename and (
 
756
  t = (d.get("text") or "").strip()
757
  if t:
758
  texts.append(t)
759
+ return "\n\n".join(texts).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
760
 
761
+ # ---------------------------- Admin helpers ----------------------------
762
  def get_kb_runtime_info() -> Dict[str, Any]:
763
  return {
764
  "chroma_path": CHROMA_PATH,
 
778
  pass
779
  global collection
780
  collection = client.get_or_create_collection(name="knowledge_base")
781
+
782
  try:
783
  if os.path.isfile(BM25_INDEX_FILE):
784
  os.remove(BM25_INDEX_FILE)
785
  except Exception as e:
786
  result.setdefault("warnings", []).append(f"bm25 index delete: {e}")
787
+
788
  os.makedirs(CHROMA_PATH, exist_ok=True)
789
  ingest_documents(folder_path)
790
  result["info"] = get_kb_runtime_info()