srilakshu012456 committed on
Commit
82c195c
·
verified ·
1 Parent(s): 4c40701

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +90 -159
services/kb_creation.py CHANGED
@@ -13,10 +13,7 @@ client = chromadb.PersistentClient(path=CHROMA_PATH)
13
  collection = client.get_or_create_collection(name="knowledge_base")
14
 
15
  # --------------------------- Embedding model ---------------------------
16
- # You can swap to a multilingual model if you expect mixed language queries:
17
- # model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
18
- # MODEL_PATH = './models/all-MiniLM-L6-v2'
19
- # model = SentenceTransformer(MODEL_PATH)
20
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
21
 
22
  # --------------------------- BM25 (lightweight) ---------------------------
@@ -31,24 +28,14 @@ BM25_B = 0.75
31
 
32
  # --------------------------- Utilities ---------------------------
33
  def _tokenize(text: str) -> List[str]:
34
- """
35
- Simple tokenizer: lowercase alphanumeric words; removes most punctuation.
36
- Keeps stopwords (BM25 can work with them), but normalizes whitespace.
37
- """
38
  if not text:
39
  return []
40
  text = text.lower()
41
- tokens = re.findall(r"[a-z0-9]+", text)
42
- return tokens
43
 
44
  def _normalize_query(q: str) -> str:
45
- """
46
- Language-agnostic normalization for user queries (no hardcoded domain synonyms).
47
- Removes filler verbs, collapses whitespace, lowercases, keeps key terms.
48
- """
49
  q = (q or "").strip().lower()
50
  q = re.sub(r"[^\w\s]", " ", q)
51
- # remove generic filler verbs/common noise words across English variants
52
  q = re.sub(
53
  r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b",
54
  " ",
@@ -58,17 +45,10 @@ def _normalize_query(q: str) -> str:
58
  return q
59
 
60
  def _tokenize_meta_value(val: Optional[str]) -> List[str]:
61
- if not val:
62
- return []
63
- return _tokenize(val)
64
 
65
  # --------------------------- DOCX parsing & chunking ---------------------------
66
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
67
- """
68
- Split DOCX into (section_title, paragraphs_in_section).
69
- Uses paragraph style names: 'Heading 1', 'Heading 2', etc.
70
- Falls back to document-level when no headings are present.
71
- """
72
  sections: List[Tuple[str, List[str]]] = []
73
  current_title = None
74
  current_paras: List[str] = []
@@ -77,7 +57,6 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
77
  style_name = (para.style.name if para.style else "") or ""
78
  is_heading = bool(re.match(r"Heading\s*\d+", style_name, flags=re.IGNORECASE))
79
  if is_heading and text:
80
- # commit previous section
81
  if current_title or current_paras:
82
  sections.append((current_title or "Untitled Section", current_paras))
83
  current_title = text
@@ -85,20 +64,14 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
85
  else:
86
  if text:
87
  current_paras.append(text)
88
- # final section
89
  if current_title or current_paras:
90
  sections.append((current_title or "Untitled Section", current_paras))
91
- # in case no headings at all, make one pseudo-section with all text
92
  if not sections:
93
  all_text = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
94
  sections = [("Document", all_text)]
95
  return sections
96
 
97
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
98
- """
99
- Build chunks from paragraphs ONLY (no doc/section headers in the text).
100
- We still keep title/section inside metadata so retrieval quality remains high.
101
- """
102
  body = "\n".join(paragraphs).strip()
103
  if not body:
104
  return []
@@ -107,16 +80,13 @@ def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: Lis
107
  for i in range(0, len(words), max_words):
108
  chunk_body = ' '.join(words[i:i + max_words]).strip()
109
  if chunk_body:
110
- chunks.append(chunk_body) # <-- no headers inside the chunk content
111
  if not chunks:
112
  chunks = [body]
113
  return chunks
114
 
115
  # --------------------------- Intent tagging (auto) ---------------------------
116
  def _infer_intent_tag(section_title: str) -> str:
117
- """
118
- Infer coarse intent from section title—no manual curation.
119
- """
120
  st = (section_title or "").lower()
121
  if any(k in st for k in ["process steps", "procedure", "how to", "workflow", "instructions"]):
122
  return "steps"
@@ -130,10 +100,6 @@ def _infer_intent_tag(section_title: str) -> str:
130
 
131
  # --------------------------- Ingestion ---------------------------
132
  def ingest_documents(folder_path: str) -> None:
133
- """
134
- Read .docx files, section-aware chunking, generate embeddings, store in ChromaDB,
135
- and build BM25 inverted index with persistence.
136
- """
137
  print(f"📂 Checking folder: {folder_path}")
138
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
139
  print(f"Found {len(files)} Word files: {files}")
@@ -141,13 +107,9 @@ def ingest_documents(folder_path: str) -> None:
141
  print("⚠️ No .docx files found. Please check the folder path.")
142
  return
143
 
144
- # Reset BM25 memory structures
145
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
146
- bm25_docs = []
147
- bm25_inverted = {}
148
- bm25_df = {}
149
- bm25_avgdl = 0.0
150
- bm25_ready = False
151
 
152
  for file in files:
153
  file_path = os.path.join(folder_path, file)
@@ -161,9 +123,8 @@ def ingest_documents(folder_path: str) -> None:
161
  total_chunks += len(chunks)
162
  intent_tag = _infer_intent_tag(section_title)
163
  for c_idx, chunk in enumerate(chunks):
164
- # Embedding & Chroma
165
  embedding = model.encode(chunk).tolist()
166
- doc_id = f"{file}:{s_idx}:{c_idx}" # stable unique id
167
  meta = {
168
  "filename": file,
169
  "section": section_title,
@@ -173,49 +134,35 @@ def ingest_documents(folder_path: str) -> None:
173
  "intent_tag": intent_tag, # NEW
174
  }
175
  try:
176
- collection.add(
177
- ids=[doc_id],
178
- embeddings=[embedding],
179
- documents=[chunk],
180
- metadatas=[meta],
181
- )
182
  except Exception:
183
- # upsert on duplicate
184
  try:
185
  collection.delete(ids=[doc_id])
186
- collection.add(
187
- ids=[doc_id],
188
- embeddings=[embedding],
189
- documents=[chunk],
190
- metadatas=[meta],
191
- )
192
  except Exception as e2:
193
  print(f"❌ Upsert failed for {doc_id}: {e2}")
194
 
195
- # BM25 indexing
196
  tokens = _tokenize(chunk)
197
  tf: Dict[str, int] = {}
198
  for t in tokens:
199
  tf[t] = tf.get(t, 0) + 1
200
  idx = len(bm25_docs)
201
  bm25_docs.append({"id": doc_id, "text": chunk, "tokens": tokens, "tf": tf, "length": len(tokens), "meta": meta})
202
- # update inverted index & df
203
- seen_terms = set()
204
  for term in tf.keys():
205
  bm25_inverted.setdefault(term, []).append(idx)
206
- if term not in seen_terms:
207
  bm25_df[term] = bm25_df.get(term, 0) + 1
208
- seen_terms.add(term)
209
 
210
  print(f"📄 Ingested {file} → {total_chunks} chunks")
211
 
212
- # finalize BM25 stats
213
  N = len(bm25_docs)
214
  if N > 0:
215
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
216
  bm25_ready = True
217
 
218
- # persist BM25 index
219
  payload = {
220
  "bm25_docs": bm25_docs,
221
  "bm25_inverted": bm25_inverted,
@@ -231,9 +178,6 @@ def ingest_documents(folder_path: str) -> None:
231
  print(f"✅ Documents ingested. Total entries in Chroma: {collection.count()}")
232
 
233
  def _load_bm25_index() -> None:
234
- """
235
- Load persisted BM25 index if available.
236
- """
237
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
238
  if not os.path.exists(BM25_INDEX_FILE):
239
  return
@@ -250,14 +194,10 @@ def _load_bm25_index() -> None:
250
  except Exception as e:
251
  print(f"⚠️ Could not load BM25 index: {e}")
252
 
253
- # auto-load on import
254
  _load_bm25_index()
255
 
256
  # --------------------------- BM25 search ---------------------------
257
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
258
- """
259
- Okapi BM25 score for a given doc.
260
- """
261
  if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
262
  return 0.0
263
  doc = bm25_docs[doc_idx]
@@ -270,7 +210,6 @@ def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
270
  tf = doc["tf"].get(term, 0)
271
  if tf == 0:
272
  continue
273
- # BM25 idf
274
  N = len(bm25_docs)
275
  idf_ratio = ((N - df + 0.5) / (df + 0.5))
276
  try:
@@ -283,25 +222,18 @@ def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
283
  return score
284
 
285
  def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
286
- """
287
- Returns a list of (doc_idx, score) sorted by score desc.
288
- """
289
  if not bm25_ready:
290
  return []
291
  norm = _normalize_query(query)
292
  q_terms = _tokenize(norm)
293
  if not q_terms:
294
  return []
295
-
296
- # collect candidate doc indices via inverted index
297
  candidates = set()
298
  for t in q_terms:
299
  for idx in bm25_inverted.get(t, []):
300
  candidates.add(idx)
301
  if not candidates:
302
- # fallback to brute force if no inverted match
303
  candidates = set(range(len(bm25_docs)))
304
-
305
  scored = []
306
  for idx in candidates:
307
  s = _bm25_score_for_doc(q_terms, idx)
@@ -310,32 +242,24 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
310
  scored.sort(key=lambda x: x[1], reverse=True)
311
  return scored[:top_k]
312
 
313
- # --------------------------- Semantic-only (legacy) ---------------------------
314
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
315
- """
316
- Semantic-only search (Chroma). We DO NOT ask for 'ids' in include
317
- because some Chroma clients reject it; if 'ids' is present in the
318
- response we will use it, otherwise we synthesize stable IDs from metadata.
319
- """
320
  query_embedding = model.encode(query).tolist()
321
  res = collection.query(
322
  query_embeddings=[query_embedding],
323
  n_results=top_k,
324
- include=['documents', 'metadatas', 'distances'] # no 'ids' here
325
  )
326
-
327
- # Flatten lists-per-query
328
  docs_ll = res.get("documents", [[]]) or [[]]
329
  metas_ll = res.get("metadatas", [[]]) or [[]]
330
  dists_ll = res.get("distances", [[]]) or [[]]
331
- ids_ll = res.get("ids", [[]]) or [[]] # some clients still return 'ids' anyway
332
 
333
  documents = docs_ll[0] if docs_ll else []
334
  metadatas = metas_ll[0] if metas_ll else []
335
  distances = dists_ll[0] if dists_ll else []
336
  ids = ids_ll[0] if ids_ll else []
337
 
338
- # If 'ids' is missing, synthesize stable IDs from metadata
339
  if not ids and documents:
340
  synthesized = []
341
  for i, m in enumerate(metadatas):
@@ -354,23 +278,14 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
354
  "ids": ids,
355
  }
356
 
357
- # --------------------------- Hybrid (BM25 + Embeddings) ---------------------------
358
- def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
359
- """
360
- Automatic metadata overlap score (no manual per-SOP lists).
361
- Uses filename, title, and section tokens. Range ~0..1.
362
- """
363
- if not meta:
364
- return 0.0
365
- fn_tokens = _tokenize_meta_value(meta.get("filename"))
366
- title_tokens = _tokenize_meta_value(meta.get("title"))
367
- section_tokens = _tokenize_meta_value(meta.get("section"))
368
- meta_tokens = set(fn_tokens + title_tokens + section_tokens)
369
- if not meta_tokens or not q_terms:
370
- return 0.0
371
- qset = set(q_terms)
372
- inter = len(meta_tokens & qset)
373
- return inter / max(1, len(qset))
374
 
375
  def _detect_user_intent(query: str) -> str:
376
  q = (query or "").lower()
@@ -384,42 +299,72 @@ def _detect_user_intent(query: str) -> str:
384
  return "purpose"
385
  return "neutral"
386
 
 
 
 
 
 
 
 
 
387
  def _intent_weight(meta: dict, user_intent: str) -> float:
388
  tag = (meta or {}).get("intent_tag", "neutral")
389
  if user_intent == "neutral":
390
  return 0.0
391
  if tag == user_intent:
392
- return 1.0 # strong boost when intent matches
393
  if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]:
394
- return -0.6 # penalize overview/prereqs for steps/errors queries
395
- return -0.2 # small penalty for other mismatches
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396
 
397
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
398
  """
399
- Hybrid retrieval:
400
- - Semantic (Chroma/embeddings) → distances (lower = better) → convert to similarity
401
- - BM25 keyword → score (higher = better)
402
- - Re-rank union of candidates by:
403
- final = alpha * semantic_sim + beta * bm25_norm + gamma * meta_overlap + delta * intent_boost
404
- - Document-level voting prior: aggregate scores by 'filename' and prefer the best document first.
405
- Returns a dict compatible with the extractor and includes:
406
- - 'ids': list[str]
407
- - 'combined_scores': list[float]
408
- - 'best_doc', 'best_doc_prior', 'user_intent'
409
  """
410
- # 1) Normalize query (language-agnostic)
411
  norm_query = _normalize_query(query)
412
  q_terms = _tokenize(norm_query)
413
  user_intent = _detect_user_intent(query)
 
414
 
415
- # 2) Semantic candidates (Chroma)
416
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
417
  sem_docs = sem_res.get("documents", [])
418
  sem_metas = sem_res.get("metadatas", [])
419
  sem_dists = sem_res.get("distances", [])
420
  sem_ids = sem_res.get("ids", [])
421
 
422
- # Convert distances to 0..1 similarity
423
  def dist_to_sim(d: Optional[float]) -> float:
424
  if d is None:
425
  return 0.0
@@ -430,32 +375,25 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
430
 
431
  sem_sims = [dist_to_sim(d) for d in sem_dists]
432
 
433
- # 3) BM25 candidates
434
  bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
435
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
436
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
437
 
438
- # 4) Prepare BM25 maps
439
- bm25_id_to_norm: Dict[str, float] = {}
440
- bm25_id_to_text: Dict[str, str] = {}
441
- bm25_id_to_meta: Dict[str, Dict[str, Any]] = {}
442
-
443
  for idx, nscore in bm25_norm_pairs:
444
  d = bm25_docs[idx]
445
  bm25_id_to_norm[d["id"]] = nscore
446
  bm25_id_to_text[d["id"]] = d["text"]
447
  bm25_id_to_meta[d["id"]] = d["meta"]
448
 
449
- # 5) Union of candidates
450
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
451
 
452
- gamma = 0.25 # metadata overlap weight
453
- delta = 0.35 # intent-aware weight
454
-
455
- combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float]] = [] # include overlap+intent
456
 
 
457
  for cid in union_ids:
458
- # semantic part
459
  if cid in sem_ids:
460
  pos = sem_ids.index(cid)
461
  sem_sim = sem_sims[pos] if pos < len(sem_sims) else 0.0
@@ -465,52 +403,44 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
465
  else:
466
  sem_sim, sem_dist, sem_text, sem_meta = 0.0, None, "", {}
467
 
468
- # bm25 part
469
  bm25_sim = bm25_id_to_norm.get(cid, 0.0)
470
  bm25_text = bm25_id_to_text.get(cid, "")
471
  bm25_meta = bm25_id_to_meta.get(cid, {})
472
 
473
- # prefer non-empty text/meta
474
  text = sem_text if sem_text else bm25_text
475
  meta = sem_meta if sem_meta else bm25_meta
476
 
477
- # NEW: automatic metadata overlap + intent-aware boost
478
  m_overlap = _meta_overlap(meta, q_terms)
479
  intent_boost = _intent_weight(meta, user_intent)
 
480
 
481
- # final combined score
482
- final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost
483
 
484
  combined_records_ext.append(
485
- (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost)
486
  )
487
 
488
- # ---------------- Document-level voting prior ----------------
489
  from collections import defaultdict
490
- doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float]]] = defaultdict(list)
491
  for rec in combined_records_ext:
492
  meta = rec[4] or {}
493
  fn = meta.get("filename", "unknown")
494
  doc_groups[fn].append(rec)
495
 
496
- # Compute doc_prior = sum(final_score) + bonuses for overlap+intent
497
- def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float]]) -> float:
498
  total_score = sum(r[1] for r in recs)
499
  total_overlap = sum(r[5] for r in recs)
500
- total_intent = sum(max(0.0, r[6]) for r in recs) # positive intent boosts
501
- total_penalty = sum(min(0.0, r[6]) for r in recs) # penalties
502
- return total_score + 0.4 * total_overlap + 0.6 * total_intent + 0.3 * total_penalty
 
503
 
504
- # Pick best document
505
- best_doc = None
506
- best_doc_prior = -1.0
507
  for fn, recs in doc_groups.items():
508
  p = doc_prior(recs)
509
  if p > best_doc_prior:
510
- best_doc_prior = p
511
- best_doc = fn
512
 
513
- # Reorder: take items from best_doc first (sorted by score), then others
514
  best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
515
  other_recs = []
516
  for fn, recs in doc_groups.items():
@@ -534,7 +464,8 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
534
  "distances": distances,
535
  "ids": ids,
536
  "combined_scores": combined_scores,
537
- "best_doc": best_doc, # helpful for debugging
538
- "best_doc_prior": best_doc_prior, # helpful for debugging
539
- "user_intent": user_intent, # helpful for debugging
 
540
  }
 
13
  collection = client.get_or_create_collection(name="knowledge_base")
14
 
15
  # --------------------------- Embedding model ---------------------------
16
+ # model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2') # optional
 
 
 
17
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
18
 
19
  # --------------------------- BM25 (lightweight) ---------------------------
 
28
 
29
  # --------------------------- Utilities ---------------------------
30
  def _tokenize(text: str) -> List[str]:
 
 
 
 
31
  if not text:
32
  return []
33
  text = text.lower()
34
+ return re.findall(r"[a-z0-9]+", text)
 
35
 
36
  def _normalize_query(q: str) -> str:
 
 
 
 
37
  q = (q or "").strip().lower()
38
  q = re.sub(r"[^\w\s]", " ", q)
 
39
  q = re.sub(
40
  r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b",
41
  " ",
 
45
  return q
46
 
47
  def _tokenize_meta_value(val: Optional[str]) -> List[str]:
48
+ return _tokenize(val or "")
 
 
49
 
50
  # --------------------------- DOCX parsing & chunking ---------------------------
51
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
 
 
 
 
 
52
  sections: List[Tuple[str, List[str]]] = []
53
  current_title = None
54
  current_paras: List[str] = []
 
57
  style_name = (para.style.name if para.style else "") or ""
58
  is_heading = bool(re.match(r"Heading\s*\d+", style_name, flags=re.IGNORECASE))
59
  if is_heading and text:
 
60
  if current_title or current_paras:
61
  sections.append((current_title or "Untitled Section", current_paras))
62
  current_title = text
 
64
  else:
65
  if text:
66
  current_paras.append(text)
 
67
  if current_title or current_paras:
68
  sections.append((current_title or "Untitled Section", current_paras))
 
69
  if not sections:
70
  all_text = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
71
  sections = [("Document", all_text)]
72
  return sections
73
 
74
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
 
 
 
 
75
  body = "\n".join(paragraphs).strip()
76
  if not body:
77
  return []
 
80
  for i in range(0, len(words), max_words):
81
  chunk_body = ' '.join(words[i:i + max_words]).strip()
82
  if chunk_body:
83
+ chunks.append(chunk_body) # no doc/section headers inside text
84
  if not chunks:
85
  chunks = [body]
86
  return chunks
87
 
88
  # --------------------------- Intent tagging (auto) ---------------------------
89
  def _infer_intent_tag(section_title: str) -> str:
 
 
 
90
  st = (section_title or "").lower()
91
  if any(k in st for k in ["process steps", "procedure", "how to", "workflow", "instructions"]):
92
  return "steps"
 
100
 
101
  # --------------------------- Ingestion ---------------------------
102
  def ingest_documents(folder_path: str) -> None:
 
 
 
 
103
  print(f"📂 Checking folder: {folder_path}")
104
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
105
  print(f"Found {len(files)} Word files: {files}")
 
107
  print("⚠️ No .docx files found. Please check the folder path.")
108
  return
109
 
 
110
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
111
+ bm25_docs, bm25_inverted, bm25_df = [], {}, {}
112
+ bm25_avgdl, bm25_ready = 0.0, False
 
 
 
113
 
114
  for file in files:
115
  file_path = os.path.join(folder_path, file)
 
123
  total_chunks += len(chunks)
124
  intent_tag = _infer_intent_tag(section_title)
125
  for c_idx, chunk in enumerate(chunks):
 
126
  embedding = model.encode(chunk).tolist()
127
+ doc_id = f"{file}:{s_idx}:{c_idx}"
128
  meta = {
129
  "filename": file,
130
  "section": section_title,
 
134
  "intent_tag": intent_tag, # NEW
135
  }
136
  try:
137
+ collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
 
 
 
 
 
138
  except Exception:
 
139
  try:
140
  collection.delete(ids=[doc_id])
141
+ collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
 
 
 
 
 
142
  except Exception as e2:
143
  print(f"❌ Upsert failed for {doc_id}: {e2}")
144
 
 
145
  tokens = _tokenize(chunk)
146
  tf: Dict[str, int] = {}
147
  for t in tokens:
148
  tf[t] = tf.get(t, 0) + 1
149
  idx = len(bm25_docs)
150
  bm25_docs.append({"id": doc_id, "text": chunk, "tokens": tokens, "tf": tf, "length": len(tokens), "meta": meta})
151
+
152
+ seen = set()
153
  for term in tf.keys():
154
  bm25_inverted.setdefault(term, []).append(idx)
155
+ if term not in seen:
156
  bm25_df[term] = bm25_df.get(term, 0) + 1
157
+ seen.add(term)
158
 
159
  print(f"📄 Ingested {file} → {total_chunks} chunks")
160
 
 
161
  N = len(bm25_docs)
162
  if N > 0:
163
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
164
  bm25_ready = True
165
 
 
166
  payload = {
167
  "bm25_docs": bm25_docs,
168
  "bm25_inverted": bm25_inverted,
 
178
  print(f"✅ Documents ingested. Total entries in Chroma: {collection.count()}")
179
 
180
  def _load_bm25_index() -> None:
 
 
 
181
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
182
  if not os.path.exists(BM25_INDEX_FILE):
183
  return
 
194
  except Exception as e:
195
  print(f"⚠️ Could not load BM25 index: {e}")
196
 
 
197
  _load_bm25_index()
198
 
199
  # --------------------------- BM25 search ---------------------------
200
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
 
 
 
201
  if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
202
  return 0.0
203
  doc = bm25_docs[doc_idx]
 
210
  tf = doc["tf"].get(term, 0)
211
  if tf == 0:
212
  continue
 
213
  N = len(bm25_docs)
214
  idf_ratio = ((N - df + 0.5) / (df + 0.5))
215
  try:
 
222
  return score
223
 
224
  def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
 
 
 
225
  if not bm25_ready:
226
  return []
227
  norm = _normalize_query(query)
228
  q_terms = _tokenize(norm)
229
  if not q_terms:
230
  return []
 
 
231
  candidates = set()
232
  for t in q_terms:
233
  for idx in bm25_inverted.get(t, []):
234
  candidates.add(idx)
235
  if not candidates:
 
236
  candidates = set(range(len(bm25_docs)))
 
237
  scored = []
238
  for idx in candidates:
239
  s = _bm25_score_for_doc(q_terms, idx)
 
242
  scored.sort(key=lambda x: x[1], reverse=True)
243
  return scored[:top_k]
244
 
245
+ # --------------------------- Semantic-only ---------------------------
246
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
 
 
 
 
 
247
  query_embedding = model.encode(query).tolist()
248
  res = collection.query(
249
  query_embeddings=[query_embedding],
250
  n_results=top_k,
251
+ include=['documents', 'metadatas', 'distances'] # no 'ids' here
252
  )
 
 
253
  docs_ll = res.get("documents", [[]]) or [[]]
254
  metas_ll = res.get("metadatas", [[]]) or [[]]
255
  dists_ll = res.get("distances", [[]]) or [[]]
256
+ ids_ll = res.get("ids", [[]]) or [[]]
257
 
258
  documents = docs_ll[0] if docs_ll else []
259
  metadatas = metas_ll[0] if metas_ll else []
260
  distances = dists_ll[0] if dists_ll else []
261
  ids = ids_ll[0] if ids_ll else []
262
 
 
263
  if not ids and documents:
264
  synthesized = []
265
  for i, m in enumerate(metadatas):
 
278
  "ids": ids,
279
  }
280
 
281
# --------------------------- Hybrid (BM25 + Embeddings + Intent + Action) ---------------------------
# Canonical action verbs mapped to the surface forms users type in queries.
# Consumed by _extract_actions() (query side) and _action_weight() (chunk side).
ACTION_SYNONYMS = {
    "create":   ["create", "creation", "add", "new", "generate"],
    "update":   ["update", "modify", "change", "edit"],
    "delete":   ["delete", "remove"],
    "navigate": ["navigate", "go to", "open"],
    "perform":  ["perform", "execute", "do"],
}
 
 
 
 
 
 
 
 
 
289
 
290
  def _detect_user_intent(query: str) -> str:
291
  q = (query or "").lower()
 
299
  return "purpose"
300
  return "neutral"
301
 
302
def _extract_actions(query: str) -> List[str]:
    """Return the canonical action verbs mentioned in *query*.

    Scans ACTION_SYNONYMS and matches each synonym on word boundaries
    (multi-word synonyms such as "go to" are supported), instead of raw
    substring containment — so the "do" synonym of "perform" no longer
    fires inside words like "download" or "document", and "add" no
    longer fires inside "address".

    Returns an empty list when no action is detected (same as before).
    """
    q = (query or "").lower()
    found: List[str] = []
    for action, synonyms in ACTION_SYNONYMS.items():
        # \b-anchored search avoids substring false positives.
        if any(re.search(rf"\b{re.escape(syn)}\b", q) for syn in synonyms):
            found.append(action)
    return found
309
+
310
  def _intent_weight(meta: dict, user_intent: str) -> float:
311
  tag = (meta or {}).get("intent_tag", "neutral")
312
  if user_intent == "neutral":
313
  return 0.0
314
  if tag == user_intent:
315
+ return 1.0
316
  if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]:
317
+ return -0.6
318
+ return -0.2
319
+
320
+ def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
321
+ fn_tokens = _tokenize_meta_value(meta.get("filename"))
322
+ title_tokens = _tokenize_meta_value(meta.get("title"))
323
+ section_tokens = _tokenize_meta_value(meta.get("section"))
324
+ meta_tokens = set(fn_tokens + title_tokens + section_tokens)
325
+ if not meta_tokens or not q_terms:
326
+ return 0.0
327
+ qset = set(q_terms)
328
+ inter = len(meta_tokens & qset)
329
+ return inter / max(1, len(qset))
330
+
331
def _action_weight(text: str, actions: List[str]) -> float:
    """Score a chunk's text against the query's target action verbs.

    +1.0 for every synonym of a requested action present in the text;
    -0.8 for every synonym of a conflicting action (e.g. the query asks
    to *create* but the chunk talks about *delete*). Returns 0.0 when no
    actions were extracted from the query.

    Synonyms are matched on word boundaries, so short synonyms such as
    "do" or "add" no longer fire inside unrelated words ("download",
    "address") as they did with plain substring containment.
    """
    if not actions:
        return 0.0
    t = (text or "").lower()

    def _mentions(syn: str) -> bool:
        # Word-boundary match; also handles multi-word synonyms ("go to").
        return re.search(rf"\b{re.escape(syn)}\b", t) is not None

    score = 0.0
    for act in actions:
        for syn in ACTION_SYNONYMS.get(act, [act]):
            if _mentions(syn):
                score += 1.0  # boost for each matching synonym
    # Penalize conflicting actions: e.g. query 'create' but text mentions 'delete'.
    conflicts = {
        "create": ["delete"],
        "delete": ["create"],
        "update": ["delete"],
        "navigate": [],
        "perform": [],
    }
    for act in actions:
        for bad in conflicts.get(act, []):
            for syn in ACTION_SYNONYMS.get(bad, [bad]):
                if _mentions(syn):
                    score -= 0.8
    return score
351
 
352
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
353
  """
354
+ final = alpha * semantic_sim + beta * bm25_norm + gamma * meta_overlap + delta * intent_boost + epsilon * action_weight
355
+ + document-level voting prior.
 
 
 
 
 
 
 
 
356
  """
 
357
  norm_query = _normalize_query(query)
358
  q_terms = _tokenize(norm_query)
359
  user_intent = _detect_user_intent(query)
360
+ actions = _extract_actions(query) # NEW
361
 
 
362
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
363
  sem_docs = sem_res.get("documents", [])
364
  sem_metas = sem_res.get("metadatas", [])
365
  sem_dists = sem_res.get("distances", [])
366
  sem_ids = sem_res.get("ids", [])
367
 
 
368
  def dist_to_sim(d: Optional[float]) -> float:
369
  if d is None:
370
  return 0.0
 
375
 
376
  sem_sims = [dist_to_sim(d) for d in sem_dists]
377
 
 
378
  bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
379
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
380
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
381
 
382
+ bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
 
 
 
 
383
  for idx, nscore in bm25_norm_pairs:
384
  d = bm25_docs[idx]
385
  bm25_id_to_norm[d["id"]] = nscore
386
  bm25_id_to_text[d["id"]] = d["text"]
387
  bm25_id_to_meta[d["id"]] = d["meta"]
388
 
 
389
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
390
 
391
+ gamma = 0.25 # meta overlap
392
+ delta = 0.35 # intent boost
393
+ epsilon = 0.30 # action weight
 
394
 
395
+ combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]] = []
396
  for cid in union_ids:
 
397
  if cid in sem_ids:
398
  pos = sem_ids.index(cid)
399
  sem_sim = sem_sims[pos] if pos < len(sem_sims) else 0.0
 
403
  else:
404
  sem_sim, sem_dist, sem_text, sem_meta = 0.0, None, "", {}
405
 
 
406
  bm25_sim = bm25_id_to_norm.get(cid, 0.0)
407
  bm25_text = bm25_id_to_text.get(cid, "")
408
  bm25_meta = bm25_id_to_meta.get(cid, {})
409
 
 
410
  text = sem_text if sem_text else bm25_text
411
  meta = sem_meta if sem_meta else bm25_meta
412
 
 
413
  m_overlap = _meta_overlap(meta, q_terms)
414
  intent_boost = _intent_weight(meta, user_intent)
415
+ act_wt = _action_weight(text, actions) # NEW
416
 
417
+ final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost + epsilon * act_wt
 
418
 
419
  combined_records_ext.append(
420
+ (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt)
421
  )
422
 
 
423
  from collections import defaultdict
424
+ doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]]] = defaultdict(list)
425
  for rec in combined_records_ext:
426
  meta = rec[4] or {}
427
  fn = meta.get("filename", "unknown")
428
  doc_groups[fn].append(rec)
429
 
430
+ def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]]) -> float:
 
431
  total_score = sum(r[1] for r in recs)
432
  total_overlap = sum(r[5] for r in recs)
433
+ total_intent = sum(max(0.0, r[6]) for r in recs)
434
+ total_action = sum(max(0.0, r[7]) for r in recs)
435
+ total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
436
+ return total_score + 0.4 * total_overlap + 0.6 * total_intent + 0.5 * total_action + 0.3 * total_penalty
437
 
438
+ best_doc, best_doc_prior = None, -1.0
 
 
439
  for fn, recs in doc_groups.items():
440
  p = doc_prior(recs)
441
  if p > best_doc_prior:
442
+ best_doc_prior, best_doc = p, fn
 
443
 
 
444
  best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
445
  other_recs = []
446
  for fn, recs in doc_groups.items():
 
464
  "distances": distances,
465
  "ids": ids,
466
  "combined_scores": combined_scores,
467
+ "best_doc": best_doc,
468
+ "best_doc_prior": best_doc_prior,
469
+ "user_intent": user_intent,
470
+ "actions": actions,
471
  }