srilakshu012456 committed on
Commit
60c3916
·
verified ·
1 Parent(s): f89383d

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +73 -22
services/kb_creation.py CHANGED
@@ -6,7 +6,7 @@ from typing import List, Dict, Any, Tuple, Optional
6
  from docx import Document
7
  from sentence_transformers import SentenceTransformer
8
  import chromadb
9
- #updated
10
  # --------------------------- ChromaDB setup ---------------------------
11
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
12
  client = chromadb.PersistentClient(path=CHROMA_PATH)
@@ -112,6 +112,22 @@ def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: Lis
112
  chunks = [body]
113
  return chunks
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  # --------------------------- Ingestion ---------------------------
116
  def ingest_documents(folder_path: str) -> None:
117
  """
@@ -143,11 +159,19 @@ def ingest_documents(folder_path: str) -> None:
143
  for s_idx, (section_title, paras) in enumerate(sections):
144
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
145
  total_chunks += len(chunks)
 
146
  for c_idx, chunk in enumerate(chunks):
147
  # Embedding & Chroma
148
  embedding = model.encode(chunk).tolist()
149
  doc_id = f"{file}:{s_idx}:{c_idx}" # stable unique id
150
- meta = {"filename": file, "section": section_title, "chunk_index": c_idx, "title": doc_title, "collection": "SOP"}
 
 
 
 
 
 
 
151
  try:
152
  collection.add(
153
  ids=[doc_id],
@@ -220,7 +244,6 @@ def _load_bm25_index() -> None:
220
  bm25_inverted = payload.get("bm25_inverted", {})
221
  bm25_df = payload.get("bm25_df", {})
222
  bm25_avgdl = payload.get("bm25_avgdl", 0.0)
223
- # params retained but we keep module-level constants
224
  bm25_ready = len(bm25_docs) > 0
225
  if bm25_ready:
226
  print(f"✅ BM25 index loaded: {BM25_INDEX_FILE} (docs={len(bm25_docs)})")
@@ -249,7 +272,7 @@ def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
249
  continue
250
  # BM25 idf
251
  N = len(bm25_docs)
252
- idf_ratio = ( (N - df + 0.5) / (df + 0.5) )
253
  try:
254
  import math
255
  idf = math.log(idf_ratio + 1.0)
@@ -334,7 +357,7 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
334
  # --------------------------- Hybrid (BM25 + Embeddings) ---------------------------
335
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
336
  """
337
- Automatic metadata overlap score (no manual module list).
338
  Uses filename, title, and section tokens. Range ~0..1.
339
  """
340
  if not meta:
@@ -349,21 +372,45 @@ def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
349
  inter = len(meta_tokens & qset)
350
  return inter / max(1, len(qset))
351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
353
  """
354
  Hybrid retrieval:
355
  - Semantic (Chroma/embeddings) → distances (lower = better) → convert to similarity
356
  - BM25 keyword → score (higher = better)
357
  - Re-rank union of candidates by:
358
- final = alpha * semantic_sim + beta * bm25_norm + gamma * meta_overlap
359
  - Document-level voting prior: aggregate scores by 'filename' and prefer the best document first.
360
  Returns a dict compatible with the extractor and includes:
361
  - 'ids': list[str]
362
- - 'combined_scores': list[float] (0..1ish)
 
363
  """
364
  # 1) Normalize query (language-agnostic)
365
  norm_query = _normalize_query(query)
366
  q_terms = _tokenize(norm_query)
 
367
 
368
  # 2) Semantic candidates (Chroma)
369
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
@@ -372,12 +419,12 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
372
  sem_dists = sem_res.get("distances", [])
373
  sem_ids = sem_res.get("ids", [])
374
 
375
- # Convert distances to 0..1 similarity (simple monotonic mapping)
376
  def dist_to_sim(d: Optional[float]) -> float:
377
  if d is None:
378
  return 0.0
379
  try:
380
- return 1.0 / (1.0 + float(d)) # lower distance -> higher sim
381
  except Exception:
382
  return 0.0
383
 
@@ -385,11 +432,10 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
385
 
386
  # 3) BM25 candidates
387
  bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
388
- # normalize BM25 scores to 0..1
389
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
390
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
391
 
392
- # 4) Merge candidates by doc_id
393
  bm25_id_to_norm: Dict[str, float] = {}
394
  bm25_id_to_text: Dict[str, str] = {}
395
  bm25_id_to_meta: Dict[str, Dict[str, Any]] = {}
@@ -400,11 +446,13 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
400
  bm25_id_to_text[d["id"]] = d["text"]
401
  bm25_id_to_meta[d["id"]] = d["meta"]
402
 
 
403
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
404
 
405
- gamma = 0.25 # metadata boost weight (tunable)
 
406
 
407
- combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float]] = [] # include meta_overlap
408
 
409
  for cid in union_ids:
410
  # semantic part
@@ -426,30 +474,32 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
426
  text = sem_text if sem_text else bm25_text
427
  meta = sem_meta if sem_meta else bm25_meta
428
 
429
- # NEW: automatic metadata overlap (no manual lists)
430
  m_overlap = _meta_overlap(meta, q_terms)
 
431
 
432
  # final combined score
433
- final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap
434
 
435
  combined_records_ext.append(
436
- (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap)
437
  )
438
 
439
  # ---------------- Document-level voting prior ----------------
440
- # Group by filename and compute aggregate doc score → prefer best doc first
441
  from collections import defaultdict
442
- doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float]]] = defaultdict(list)
443
  for rec in combined_records_ext:
444
  meta = rec[4] or {}
445
  fn = meta.get("filename", "unknown")
446
  doc_groups[fn].append(rec)
447
 
448
- # Compute doc_prior = sum(final_score) + small bonus for metadata overlap sum
449
- def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float]]) -> float:
450
  total_score = sum(r[1] for r in recs)
451
- total_meta = sum(r[5] for r in recs)
452
- return total_score + 0.4 * total_meta # 0.4 is tunable
 
 
453
 
454
  # Pick best document
455
  best_doc = None
@@ -486,4 +536,5 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
486
  "combined_scores": combined_scores,
487
  "best_doc": best_doc, # helpful for debugging
488
  "best_doc_prior": best_doc_prior, # helpful for debugging
 
489
  }
 
6
  from docx import Document
7
  from sentence_transformers import SentenceTransformer
8
  import chromadb
9
+
10
  # --------------------------- ChromaDB setup ---------------------------
11
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
12
  client = chromadb.PersistentClient(path=CHROMA_PATH)
 
112
  chunks = [body]
113
  return chunks
114
 
115
+ # --------------------------- Intent tagging (auto) ---------------------------
116
+ def _infer_intent_tag(section_title: str) -> str:
117
+ """
118
+ Infer coarse intent from section title—no manual curation.
119
+ """
120
+ st = (section_title or "").lower()
121
+ if any(k in st for k in ["process steps", "procedure", "how to", "workflow", "instructions"]):
122
+ return "steps"
123
+ if any(k in st for k in ["common errors", "resolution", "troubleshooting"]):
124
+ return "errors"
125
+ if any(k in st for k in ["pre-requisites", "prerequisites"]):
126
+ return "prereqs"
127
+ if any(k in st for k in ["purpose", "overview", "introduction"]):
128
+ return "purpose"
129
+ return "neutral"
130
+
131
  # --------------------------- Ingestion ---------------------------
132
  def ingest_documents(folder_path: str) -> None:
133
  """
 
159
  for s_idx, (section_title, paras) in enumerate(sections):
160
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
161
  total_chunks += len(chunks)
162
+ intent_tag = _infer_intent_tag(section_title)
163
  for c_idx, chunk in enumerate(chunks):
164
  # Embedding & Chroma
165
  embedding = model.encode(chunk).tolist()
166
  doc_id = f"{file}:{s_idx}:{c_idx}" # stable unique id
167
+ meta = {
168
+ "filename": file,
169
+ "section": section_title,
170
+ "chunk_index": c_idx,
171
+ "title": doc_title,
172
+ "collection": "SOP",
173
+ "intent_tag": intent_tag, # NEW
174
+ }
175
  try:
176
  collection.add(
177
  ids=[doc_id],
 
244
  bm25_inverted = payload.get("bm25_inverted", {})
245
  bm25_df = payload.get("bm25_df", {})
246
  bm25_avgdl = payload.get("bm25_avgdl", 0.0)
 
247
  bm25_ready = len(bm25_docs) > 0
248
  if bm25_ready:
249
  print(f"✅ BM25 index loaded: {BM25_INDEX_FILE} (docs={len(bm25_docs)})")
 
272
  continue
273
  # BM25 idf
274
  N = len(bm25_docs)
275
+ idf_ratio = ((N - df + 0.5) / (df + 0.5))
276
  try:
277
  import math
278
  idf = math.log(idf_ratio + 1.0)
 
357
  # --------------------------- Hybrid (BM25 + Embeddings) ---------------------------
358
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
359
  """
360
+ Automatic metadata overlap score (no manual per-SOP lists).
361
  Uses filename, title, and section tokens. Range ~0..1.
362
  """
363
  if not meta:
 
372
  inter = len(meta_tokens & qset)
373
  return inter / max(1, len(qset))
374
 
375
+ def _detect_user_intent(query: str) -> str:
376
+ q = (query or "").lower()
377
+ if any(k in q for k in ["steps", "procedure", "how to", "navigate", "perform", "do", "process"]):
378
+ return "steps"
379
+ if any(k in q for k in ["error", "issue", "fail", "not working", "resolution", "fix"]):
380
+ return "errors"
381
+ if any(k in q for k in ["pre-requisite", "prerequisites", "requirement", "requirements"]):
382
+ return "prereqs"
383
+ if any(k in q for k in ["purpose", "overview", "introduction"]):
384
+ return "purpose"
385
+ return "neutral"
386
+
387
+ def _intent_weight(meta: dict, user_intent: str) -> float:
388
+ tag = (meta or {}).get("intent_tag", "neutral")
389
+ if user_intent == "neutral":
390
+ return 0.0
391
+ if tag == user_intent:
392
+ return 1.0 # strong boost when intent matches
393
+ if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]:
394
+ return -0.6 # penalize overview/prereqs for steps/errors queries
395
+ return -0.2 # small penalty for other mismatches
396
+
397
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
398
  """
399
  Hybrid retrieval:
400
  - Semantic (Chroma/embeddings) → distances (lower = better) → convert to similarity
401
  - BM25 keyword → score (higher = better)
402
  - Re-rank union of candidates by:
403
+ final = alpha * semantic_sim + beta * bm25_norm + gamma * meta_overlap + delta * intent_boost
404
  - Document-level voting prior: aggregate scores by 'filename' and prefer the best document first.
405
  Returns a dict compatible with the extractor and includes:
406
  - 'ids': list[str]
407
+ - 'combined_scores': list[float]
408
+ - 'best_doc', 'best_doc_prior', 'user_intent'
409
  """
410
  # 1) Normalize query (language-agnostic)
411
  norm_query = _normalize_query(query)
412
  q_terms = _tokenize(norm_query)
413
+ user_intent = _detect_user_intent(query)
414
 
415
  # 2) Semantic candidates (Chroma)
416
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
 
419
  sem_dists = sem_res.get("distances", [])
420
  sem_ids = sem_res.get("ids", [])
421
 
422
+ # Convert distances to 0..1 similarity
423
  def dist_to_sim(d: Optional[float]) -> float:
424
  if d is None:
425
  return 0.0
426
  try:
427
+ return 1.0 / (1.0 + float(d))
428
  except Exception:
429
  return 0.0
430
 
 
432
 
433
  # 3) BM25 candidates
434
  bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
 
435
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
436
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
437
 
438
+ # 4) Prepare BM25 maps
439
  bm25_id_to_norm: Dict[str, float] = {}
440
  bm25_id_to_text: Dict[str, str] = {}
441
  bm25_id_to_meta: Dict[str, Dict[str, Any]] = {}
 
446
  bm25_id_to_text[d["id"]] = d["text"]
447
  bm25_id_to_meta[d["id"]] = d["meta"]
448
 
449
+ # 5) Union of candidates
450
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
451
 
452
+ gamma = 0.25 # metadata overlap weight
453
+ delta = 0.35 # intent-aware weight
454
 
455
+ combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float]] = [] # include overlap+intent
456
 
457
  for cid in union_ids:
458
  # semantic part
 
474
  text = sem_text if sem_text else bm25_text
475
  meta = sem_meta if sem_meta else bm25_meta
476
 
477
+ # NEW: automatic metadata overlap + intent-aware boost
478
  m_overlap = _meta_overlap(meta, q_terms)
479
+ intent_boost = _intent_weight(meta, user_intent)
480
 
481
  # final combined score
482
+ final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost
483
 
484
  combined_records_ext.append(
485
+ (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost)
486
  )
487
 
488
  # ---------------- Document-level voting prior ----------------
 
489
  from collections import defaultdict
490
+ doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float]]] = defaultdict(list)
491
  for rec in combined_records_ext:
492
  meta = rec[4] or {}
493
  fn = meta.get("filename", "unknown")
494
  doc_groups[fn].append(rec)
495
 
496
+ # Compute doc_prior = sum(final_score) + bonuses for overlap+intent
497
+ def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float]]) -> float:
498
  total_score = sum(r[1] for r in recs)
499
+ total_overlap = sum(r[5] for r in recs)
500
+ total_intent = sum(max(0.0, r[6]) for r in recs) # positive intent boosts
501
+ total_penalty = sum(min(0.0, r[6]) for r in recs) # penalties
502
+ return total_score + 0.4 * total_overlap + 0.6 * total_intent + 0.3 * total_penalty
503
 
504
  # Pick best document
505
  best_doc = None
 
536
  "combined_scores": combined_scores,
537
  "best_doc": best_doc, # helpful for debugging
538
  "best_doc_prior": best_doc_prior, # helpful for debugging
539
+ "user_intent": user_intent, # helpful for debugging
540
  }