srilakshu012456 committed on
Commit
125282c
·
verified ·
1 Parent(s): a226b7b

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +87 -35
services/kb_creation.py CHANGED
@@ -43,6 +43,8 @@ def _tokenize_meta_value(val: Optional[str]) -> List[str]:
43
  return _tokenize(val or "")
44
 
45
  # ---------------------------- DOCX parsing & chunking ----------------------------
 
 
46
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
47
  sections: List[Tuple[str, List[str]]] = []
48
  current_title = None
@@ -66,18 +68,57 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
66
  sections = [("Document", all_text)]
67
  return sections
68
 
69
- def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
70
- body = "\n".join(paragraphs).strip()
71
- if not body:
72
- return []
73
- words = body.split()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  chunks: List[str] = []
75
- for i in range(0, len(words), max_words):
76
- chunk_body = ' '.join(words[i:i + max_words]).strip()
77
- if chunk_body:
78
- chunks.append(chunk_body)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  if not chunks:
80
- chunks = [body]
 
 
81
  return chunks
82
 
83
  # ---------------------------- Intent & Module tagging ----------------------------
@@ -89,7 +130,7 @@ PERMISSION_TERMS = [
89
  "role", "role access", "role mapping", "security", "security profile", "privilege", "insufficient",
90
  "not allowed", "not authorized", "denied", "restrict"
91
  ]
92
- ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
93
  STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
94
 
95
  MODULE_VOCAB = {
@@ -179,7 +220,7 @@ def ingest_documents(folder_path: str) -> None:
179
  total_chunks = 0
180
 
181
  for s_idx, (section_title, paras) in enumerate(sections):
182
- chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
183
  total_chunks += len(chunks)
184
 
185
  base_intent = _infer_intent_tag(section_title)
@@ -214,6 +255,7 @@ def ingest_documents(folder_path: str) -> None:
214
  except Exception as e2:
215
  print(f"[KB] ERROR: Upsert failed for {doc_id}: {e2}")
216
 
 
217
  tokens = _tokenize(chunk)
218
  tf: Dict[str, int] = {}
219
  for tkn in tokens:
@@ -353,7 +395,7 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
353
  "ids": ids,
354
  }
355
 
356
- # ---------------------------- Hybrid search (robust) ----------------------------
357
  ACTION_SYNONYMS = {
358
  "create": ["create", "creation", "add", "new", "generate"],
359
  "update": ["update", "modify", "change", "edit"],
@@ -364,7 +406,7 @@ ERROR_INTENT_TERMS = [
364
  "error", "issue", "fail", "not working", "resolution", "fix",
365
  "permission", "permissions", "access", "no access", "authorization", "authorisation",
366
  "role", "role mapping", "not authorized", "permission denied", "insufficient privileges",
367
- "escalation", "escalation path", "access right"
368
  ]
369
 
370
  def _detect_user_intent(query: str) -> str:
@@ -445,12 +487,6 @@ def _intent_weight(meta: dict, user_intent: str) -> float:
445
  return 0.75
446
  return -0.2
447
 
448
- def _normalize_for_match(text: str) -> str:
449
- t = (text or "").lower()
450
- t = re.sub(r"[^\w\s]", " ", t)
451
- t = re.sub(r"\s+", " ", t).strip()
452
- return t
453
-
454
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
455
  fn_tokens = _tokenize_meta_value(meta.get("filename"))
456
  title_tokens = _tokenize_meta_value(meta.get("title"))
@@ -482,20 +518,31 @@ def _phrase_boost_score(text: str, q_terms: List[str]) -> float:
482
  score += 0.60
483
  return min(score, 1.5)
484
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
486
  norm_query = _normalize_query(query)
487
  q_terms = _tokenize(norm_query)
488
  user_intent = _detect_user_intent(query)
489
-
490
- # Robust guards so missing helpers can’t crash
491
- try:
492
- actions = _extract_actions(query)
493
- except Exception:
494
- actions = []
495
- try:
496
- user_modules = _extract_modules_from_query(query)
497
- except Exception:
498
- user_modules = []
499
 
500
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
501
  sem_docs = sem_res.get("documents", [])
@@ -531,8 +578,9 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
531
  zeta = 0.65 # module weight
532
  eta = 0.45 # phrase-level boost
533
  theta = 0.40 # heading alignment bonus
 
534
 
535
- combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]] = []
536
  for cid in union_ids:
537
  if cid in sem_ids:
538
  pos = sem_ids.index(cid)
@@ -555,6 +603,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
555
  act_wt = _action_weight(text, actions)
556
  mod_wt = _module_weight(meta, user_modules)
557
  phrase_wt = _phrase_boost_score(text, q_terms)
 
558
 
559
  sec_low = ((meta or {}).get("section", "") or "").lower()
560
  title_low = ((meta or {}).get("title", "") or "").lower()
@@ -575,20 +624,21 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
575
  + zeta * mod_wt
576
  + eta * phrase_wt
577
  + theta * heading_bonus
 
578
  )
579
 
580
  combined_records_ext.append(
581
- (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt, phrase_wt, heading_bonus)
582
  )
583
 
584
  from collections import defaultdict
585
- doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]]] = defaultdict(list)
586
  for rec in combined_records_ext:
587
  meta = rec[4] or {}
588
  fn = meta.get("filename", "unknown")
589
  doc_groups[fn].append(rec)
590
 
591
- def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]]) -> float:
592
  total_score = sum(r[1] for r in recs)
593
  total_overlap = sum(r[5] for r in recs)
594
  total_intent = sum(max(0.0, r[6]) for r in recs)
@@ -596,6 +646,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
596
  total_module = sum(r[8] for r in recs)
597
  total_phrase = sum(r[9] for r in recs)
598
  total_heading = sum(r[10] for r in recs)
 
599
  total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
600
  esc_weight = 0.3 if any("escalation" in ((r[4] or {}).get("section", "")).lower() for r in recs) else 0.0
601
  perm_weight = 0.3 if any("permissions" in (((r[4] or {}).get("topic_tags") or [])) for r in recs) else 0.0
@@ -607,6 +658,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
607
  + 0.8 * total_module
608
  + 0.6 * total_phrase
609
  + 0.6 * total_heading
 
610
  + 0.3 * total_penalty
611
  + esc_weight + perm_weight
612
  )
@@ -618,7 +670,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
618
  best_doc_prior, best_doc = p, fn
619
 
620
  best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
621
- other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]] = []
622
  for fn, recs in doc_groups.items():
623
  if fn == best_doc:
624
  continue
 
43
  return _tokenize(val or "")
44
 
45
  # ---------------------------- DOCX parsing & chunking ----------------------------
46
+ BULLET_RE = re.compile(r"^\s*(?:[\-\*\u2022]|\d+[.)])\s+", re.IGNORECASE)
47
+
48
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
49
  sections: List[Tuple[str, List[str]]] = []
50
  current_title = None
 
68
  sections = [("Document", all_text)]
69
  return sections
70
 
71
+ def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
72
+ """
73
+ Split paragraphs into bullet-aware lines:
74
+ - Preserve bullets and numbered list lines as separate atomic lines.
75
+ - Split long paragraphs by '. ' into manageable lines.
76
+ """
77
+ lines: List[str] = []
78
+ for p in (paragraphs or []):
79
+ p = (p or "").strip()
80
+ if not p:
81
+ continue
82
+ # If looks like a bullet/numbered item, keep as is
83
+ if BULLET_RE.match(p):
84
+ lines.append(p)
85
+ continue
86
+ # Otherwise split by sentence boundaries
87
+ parts = [s.strip() for s in re.split(r"(?<=[.!?])\s+", p) if s.strip()]
88
+ lines.extend(parts)
89
+ return lines
90
+
91
+ def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 300) -> List[str]:
92
+ """
93
+ Smaller chunks for better recall; bullet-aware.
94
+ """
95
+ lines = _paragraphs_to_lines(paragraphs)
96
  chunks: List[str] = []
97
+ current: List[str] = []
98
+ current_len = 0
99
+
100
+ for ln in lines:
101
+ w = ln.split()
102
+ if current_len + len(w) > max_words or (BULLET_RE.match(ln) and current):
103
+ # close current chunk
104
+ chunk = " ".join(current).strip()
105
+ if chunk:
106
+ chunks.append(chunk)
107
+ current = [ln]
108
+ current_len = len(w)
109
+ else:
110
+ current.append(ln)
111
+ current_len += len(w)
112
+
113
+ if current:
114
+ chunk = " ".join(current).strip()
115
+ if chunk:
116
+ chunks.append(chunk)
117
+
118
  if not chunks:
119
+ body = " ".join(lines).strip()
120
+ if body:
121
+ chunks = [body]
122
  return chunks
123
 
124
  # ---------------------------- Intent & Module tagging ----------------------------
 
130
  "role", "role access", "role mapping", "security", "security profile", "privilege", "insufficient",
131
  "not allowed", "not authorized", "denied", "restrict"
132
  ]
133
+ ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't", "mismatch", "locked", "wrong", "denied"]
134
  STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
135
 
136
  MODULE_VOCAB = {
 
220
  total_chunks = 0
221
 
222
  for s_idx, (section_title, paras) in enumerate(sections):
223
+ chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=300)
224
  total_chunks += len(chunks)
225
 
226
  base_intent = _infer_intent_tag(section_title)
 
255
  except Exception as e2:
256
  print(f"[KB] ERROR: Upsert failed for {doc_id}: {e2}")
257
 
258
+ # Build BM25 index entries
259
  tokens = _tokenize(chunk)
260
  tf: Dict[str, int] = {}
261
  for tkn in tokens:
 
395
  "ids": ids,
396
  }
397
 
398
+ # ---------------------------- Hybrid search (improved) ----------------------------
399
  ACTION_SYNONYMS = {
400
  "create": ["create", "creation", "add", "new", "generate"],
401
  "update": ["update", "modify", "change", "edit"],
 
406
  "error", "issue", "fail", "not working", "resolution", "fix",
407
  "permission", "permissions", "access", "no access", "authorization", "authorisation",
408
  "role", "role mapping", "not authorized", "permission denied", "insufficient privileges",
409
+ "escalation", "escalation path", "access right", "mismatch", "locked", "wrong"
410
  ]
411
 
412
  def _detect_user_intent(query: str) -> str:
 
487
  return 0.75
488
  return -0.2
489
 
 
 
 
 
 
 
490
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
491
  fn_tokens = _tokenize_meta_value(meta.get("filename"))
492
  title_tokens = _tokenize_meta_value(meta.get("title"))
 
518
  score += 0.60
519
  return min(score, 1.5)
520
 
521
+ def _literal_query_match_boost(text: str, query_norm: str) -> float:
522
+ """
523
+ Extra boost if the exact normalized query substring (or key tokens) appear in the chunk.
524
+ Helps errors like 'item mismatch' pick the right KB line.
525
+ """
526
+ t = (text or "").lower()
527
+ q = (query_norm or "").lower()
528
+ boost = 0.0
529
+ if q and q in t:
530
+ boost += 0.6
531
+ # Also check key 2-word error tokens present in query (e.g., 'item mismatch')
532
+ toks = [tok for tok in q.split() if len(tok) > 2]
533
+ bigrams = _make_ngrams(toks, 2)
534
+ for bg in bigrams:
535
+ if bg in t:
536
+ boost += 0.6
537
+ break
538
+ return min(boost, 1.2)
539
+
540
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
541
  norm_query = _normalize_query(query)
542
  q_terms = _tokenize(norm_query)
543
  user_intent = _detect_user_intent(query)
544
+ actions = _extract_actions(query)
545
+ user_modules = _extract_modules_from_query(query)
 
 
 
 
 
 
 
 
546
 
547
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
548
  sem_docs = sem_res.get("documents", [])
 
578
  zeta = 0.65 # module weight
579
  eta = 0.45 # phrase-level boost
580
  theta = 0.40 # heading alignment bonus
581
+ iota = 0.40 # literal query match boost
582
 
583
+ combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]] = []
584
  for cid in union_ids:
585
  if cid in sem_ids:
586
  pos = sem_ids.index(cid)
 
603
  act_wt = _action_weight(text, actions)
604
  mod_wt = _module_weight(meta, user_modules)
605
  phrase_wt = _phrase_boost_score(text, q_terms)
606
+ literal_wt = _literal_query_match_boost(text, norm_query)
607
 
608
  sec_low = ((meta or {}).get("section", "") or "").lower()
609
  title_low = ((meta or {}).get("title", "") or "").lower()
 
624
  + zeta * mod_wt
625
  + eta * phrase_wt
626
  + theta * heading_bonus
627
+ + iota * literal_wt
628
  )
629
 
630
  combined_records_ext.append(
631
+ (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt, phrase_wt, heading_bonus, literal_wt)
632
  )
633
 
634
  from collections import defaultdict
635
+ doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]] = defaultdict(list)
636
  for rec in combined_records_ext:
637
  meta = rec[4] or {}
638
  fn = meta.get("filename", "unknown")
639
  doc_groups[fn].append(rec)
640
 
641
+ def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]) -> float:
642
  total_score = sum(r[1] for r in recs)
643
  total_overlap = sum(r[5] for r in recs)
644
  total_intent = sum(max(0.0, r[6]) for r in recs)
 
646
  total_module = sum(r[8] for r in recs)
647
  total_phrase = sum(r[9] for r in recs)
648
  total_heading = sum(r[10] for r in recs)
649
+ total_literal = sum(r[11] for r in recs)
650
  total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
651
  esc_weight = 0.3 if any("escalation" in ((r[4] or {}).get("section", "")).lower() for r in recs) else 0.0
652
  perm_weight = 0.3 if any("permissions" in (((r[4] or {}).get("topic_tags") or [])) for r in recs) else 0.0
 
658
  + 0.8 * total_module
659
  + 0.6 * total_phrase
660
  + 0.6 * total_heading
661
+ + 0.6 * total_literal
662
  + 0.3 * total_penalty
663
  + esc_weight + perm_weight
664
  )
 
670
  best_doc_prior, best_doc = p, fn
671
 
672
  best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
673
+ other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]] = []
674
  for fn, recs in doc_groups.items():
675
  if fn == best_doc:
676
  continue