srilakshu012456 committed on
Commit
f89383d
·
verified ·
1 Parent(s): 18ab6d6

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +126 -72
services/kb_creation.py CHANGED
@@ -6,31 +6,30 @@ from typing import List, Dict, Any, Tuple, Optional
6
  from docx import Document
7
  from sentence_transformers import SentenceTransformer
8
  import chromadb
9
-
10
- # ------------------------- ChromaDB setup -------------------------
11
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
12
  client = chromadb.PersistentClient(path=CHROMA_PATH)
13
  collection = client.get_or_create_collection(name="knowledge_base")
14
 
15
- # ------------------------- Embedding model ------------------------
16
  # You can swap to a multilingual model if you expect mixed language queries:
17
  # model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
18
- #MODEL_PATH = './models/all-MiniLM-L6-v2'
19
- #model = SentenceTransformer(MODEL_PATH)
20
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
21
 
22
- # ------------------------- BM25 (lightweight) ---------------------
23
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
24
-
25
- bm25_docs: List[Dict[str, Any]] = [] # each: {id, text, tokens, tf, length, meta}
26
  bm25_inverted: Dict[str, List[int]] = {} # term -> list of doc indices in bm25_docs
27
- bm25_df: Dict[str, int] = {} # term -> document frequency
28
  bm25_avgdl: float = 0.0
29
  bm25_ready: bool = False
30
  BM25_K1 = 1.5
31
  BM25_B = 0.75
32
 
33
- # ------------------------- Utilities ------------------------------
34
  def _tokenize(text: str) -> List[str]:
35
  """
36
  Simple tokenizer: lowercase alphanumeric words; removes most punctuation.
@@ -50,11 +49,20 @@ def _normalize_query(q: str) -> str:
50
  q = (q or "").strip().lower()
51
  q = re.sub(r"[^\w\s]", " ", q)
52
  # remove generic filler verbs/common noise words across English variants
53
- q = re.sub(r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b", " ", q)
 
 
 
 
54
  q = re.sub(r"\s+", " ", q).strip()
55
  return q
56
 
57
- # ------------------------- DOCX parsing & chunking ----------------
 
 
 
 
 
58
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
59
  """
60
  Split DOCX into (section_title, paragraphs_in_section).
@@ -64,12 +72,10 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
64
  sections: List[Tuple[str, List[str]]] = []
65
  current_title = None
66
  current_paras: List[str] = []
67
-
68
  for para in doc.paragraphs:
69
  text = (para.text or "").strip()
70
  style_name = (para.style.name if para.style else "") or ""
71
  is_heading = bool(re.match(r"Heading\s*\d+", style_name, flags=re.IGNORECASE))
72
-
73
  if is_heading and text:
74
  # commit previous section
75
  if current_title or current_paras:
@@ -79,16 +85,13 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
79
  else:
80
  if text:
81
  current_paras.append(text)
82
-
83
  # final section
84
  if current_title or current_paras:
85
  sections.append((current_title or "Untitled Section", current_paras))
86
-
87
  # in case no headings at all, make one pseudo-section with all text
88
  if not sections:
89
  all_text = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
90
  sections = [("Document", all_text)]
91
-
92
  return sections
93
 
94
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
@@ -109,7 +112,7 @@ def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: Lis
109
  chunks = [body]
110
  return chunks
111
 
112
- # ------------------------- Ingestion ------------------------------
113
  def ingest_documents(folder_path: str) -> None:
114
  """
115
  Read .docx files, section-aware chunking, generate embeddings, store in ChromaDB,
@@ -140,13 +143,11 @@ def ingest_documents(folder_path: str) -> None:
140
  for s_idx, (section_title, paras) in enumerate(sections):
141
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
142
  total_chunks += len(chunks)
143
-
144
  for c_idx, chunk in enumerate(chunks):
145
  # Embedding & Chroma
146
  embedding = model.encode(chunk).tolist()
147
  doc_id = f"{file}:{s_idx}:{c_idx}" # stable unique id
148
  meta = {"filename": file, "section": section_title, "chunk_index": c_idx, "title": doc_title, "collection": "SOP"}
149
-
150
  try:
151
  collection.add(
152
  ids=[doc_id],
@@ -154,7 +155,7 @@ def ingest_documents(folder_path: str) -> None:
154
  documents=[chunk],
155
  metadatas=[meta],
156
  )
157
- except Exception as e:
158
  # upsert on duplicate
159
  try:
160
  collection.delete(ids=[doc_id])
@@ -190,20 +191,19 @@ def ingest_documents(folder_path: str) -> None:
190
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
191
  bm25_ready = True
192
 
193
- # persist BM25 index
194
- payload = {
195
- "bm25_docs": bm25_docs,
196
- "bm25_inverted": bm25_inverted,
197
- "bm25_df": bm25_df,
198
- "bm25_avgdl": bm25_avgdl,
199
- "BM25_K1": BM25_K1,
200
- "BM25_B": BM25_B,
201
- }
202
- os.makedirs(CHROMA_PATH, exist_ok=True)
203
- with open(BM25_INDEX_FILE, "wb") as f:
204
- pickle.dump(payload, f)
205
- print(f"βœ… BM25 index saved: {BM25_INDEX_FILE}")
206
-
207
  print(f"βœ… Documents ingested. Total entries in Chroma: {collection.count()}")
208
 
209
  def _load_bm25_index() -> None:
@@ -230,7 +230,7 @@ def _load_bm25_index() -> None:
230
  # auto-load on import
231
  _load_bm25_index()
232
 
233
- # ------------------------- BM25 search ----------------------------------------
234
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
235
  """
236
  Okapi BM25 score for a given doc.
@@ -249,18 +249,14 @@ def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
249
  continue
250
  # BM25 idf
251
  N = len(bm25_docs)
252
- idf = max(0.0, ( (N - df + 0.5) / (df + 0.5) ))
253
- idf = (idf if idf > 0 else 1.0)
254
- idf = 1.0 * ( (N - df + 0.5) / (df + 0.5) ) # raw ratio
255
- # typical log form
256
  try:
257
  import math
258
- idf = math.log(idf + 1.0)
259
  except Exception:
260
- pass
261
-
262
  denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
263
- score += idf * ( (tf * (BM25_K1 + 1)) / (denom or 1.0) )
264
  return score
265
 
266
  def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
@@ -273,6 +269,7 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
273
  q_terms = _tokenize(norm)
274
  if not q_terms:
275
  return []
 
276
  # collect candidate doc indices via inverted index
277
  candidates = set()
278
  for t in q_terms:
@@ -290,8 +287,7 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
290
  scored.sort(key=lambda x: x[1], reverse=True)
291
  return scored[:top_k]
292
 
293
- # ------------------------- Semantic-only (legacy) ------------------------------
294
-
295
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
296
  """
297
  Semantic-only search (Chroma). We DO NOT ask for 'ids' in include
@@ -306,29 +302,28 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
306
  )
307
 
308
  # Flatten lists-per-query
309
- docs_ll = res.get("documents", [[]]) or [[]]
310
  metas_ll = res.get("metadatas", [[]]) or [[]]
311
  dists_ll = res.get("distances", [[]]) or [[]]
312
- ids_ll = res.get("ids", [[]]) or [[]] # some clients still return 'ids' anyway
313
 
314
- documents = docs_ll[0] if docs_ll else []
315
  metadatas = metas_ll[0] if metas_ll else []
316
  distances = dists_ll[0] if dists_ll else []
317
- ids = ids_ll[0] if ids_ll else []
318
 
319
  # If 'ids' is missing, synthesize stable IDs from metadata
320
  if not ids and documents:
321
  synthesized = []
322
  for i, m in enumerate(metadatas):
323
- fn = (m or {}).get("filename", "unknown")
324
- sec = (m or {}).get("section", "section")
325
- idx = (m or {}).get("chunk_index", i)
326
  synthesized.append(f"{fn}:{sec}:{idx}")
327
  ids = synthesized
328
 
329
  print(f"πŸ”Ž KB search β†’ {len(documents)} docs (top_k={top_k}); "
330
  f"first distance: {distances[0] if distances else 'n/a'}; ids={len(ids)}")
331
-
332
  return {
333
  "documents": documents,
334
  "metadatas": metadatas,
@@ -336,21 +331,39 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
336
  "ids": ids,
337
  }
338
 
339
- # ------------------------- Hybrid (BM25 + Embeddings) -------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
341
  """
342
  Hybrid retrieval:
343
  - Semantic (Chroma/embeddings) β†’ distances (lower = better) β†’ convert to similarity
344
  - BM25 keyword β†’ score (higher = better)
345
- - Re-rank union of candidates by: final = alpha * semantic_sim + beta * bm25_norm
346
-
347
- Returns a dict compatible with the extractor but also includes:
 
348
  - 'ids': list[str]
349
- - 'combined_scores': list[float] (0..1)
350
- - 'distances': list[float] from semantic (may be missing if fetched from BM25-only)
351
  """
352
- # 1) Normalize query (language-agnostic, no domain synonyms)
353
  norm_query = _normalize_query(query)
 
354
 
355
  # 2) Semantic candidates (Chroma)
356
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
@@ -377,21 +390,22 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
377
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
378
 
379
  # 4) Merge candidates by doc_id
380
- # For BM25 doc_idx β†’ get doc info
381
  bm25_id_to_norm: Dict[str, float] = {}
382
  bm25_id_to_text: Dict[str, str] = {}
383
  bm25_id_to_meta: Dict[str, Dict[str, Any]] = {}
 
384
  for idx, nscore in bm25_norm_pairs:
385
  d = bm25_docs[idx]
386
  bm25_id_to_norm[d["id"]] = nscore
387
  bm25_id_to_text[d["id"]] = d["text"]
388
  bm25_id_to_meta[d["id"]] = d["meta"]
389
 
390
- # Build union
391
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
392
 
393
- # 5) For each candidate id, compute combined score and collect fields
394
- combined_records: List[Tuple[str, float, float, str, Dict[str, Any]]] = []
 
 
395
  for cid in union_ids:
396
  # semantic part
397
  if cid in sem_ids:
@@ -412,18 +426,56 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
412
  text = sem_text if sem_text else bm25_text
413
  meta = sem_meta if sem_meta else bm25_meta
414
 
 
 
 
415
  # final combined score
416
- final_score = alpha * sem_sim + beta * bm25_sim
417
- combined_records.append((cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
 
419
- # 6) Sort by combined score desc and take top_k
420
- combined_records.sort(key=lambda x: x[1], reverse=True)
421
- top = combined_records[:top_k]
422
 
423
  documents = [t[3] for t in top]
424
  metadatas = [t[4] for t in top]
425
- distances = [t[2] for t in top] # keep semantic distance (999 if BM25-only)
426
- ids = [t[0] for t in top]
427
  combined_scores = [t[1] for t in top]
428
 
429
  return {
@@ -432,4 +484,6 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
432
  "distances": distances,
433
  "ids": ids,
434
  "combined_scores": combined_scores,
 
 
435
  }
 
6
  from docx import Document
7
  from sentence_transformers import SentenceTransformer
8
  import chromadb
9
+ #updated
10
+ # --------------------------- ChromaDB setup ---------------------------
11
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
12
  client = chromadb.PersistentClient(path=CHROMA_PATH)
13
  collection = client.get_or_create_collection(name="knowledge_base")
14
 
15
+ # --------------------------- Embedding model ---------------------------
16
  # You can swap to a multilingual model if you expect mixed language queries:
17
  # model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
18
+ # MODEL_PATH = './models/all-MiniLM-L6-v2'
19
+ # model = SentenceTransformer(MODEL_PATH)
20
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
21
 
22
+ # --------------------------- BM25 (lightweight) ---------------------------
23
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
24
+ bm25_docs: List[Dict[str, Any]] = [] # each: {id, text, tokens, tf, length, meta}
 
25
  bm25_inverted: Dict[str, List[int]] = {} # term -> list of doc indices in bm25_docs
26
+ bm25_df: Dict[str, int] = {} # term -> document frequency
27
  bm25_avgdl: float = 0.0
28
  bm25_ready: bool = False
29
  BM25_K1 = 1.5
30
  BM25_B = 0.75
31
 
32
+ # --------------------------- Utilities ---------------------------
33
  def _tokenize(text: str) -> List[str]:
34
  """
35
  Simple tokenizer: lowercase alphanumeric words; removes most punctuation.
 
49
  q = (q or "").strip().lower()
50
  q = re.sub(r"[^\w\s]", " ", q)
51
  # remove generic filler verbs/common noise words across English variants
52
+ q = re.sub(
53
+ r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b",
54
+ " ",
55
+ q,
56
+ )
57
  q = re.sub(r"\s+", " ", q).strip()
58
  return q
59
 
60
+ def _tokenize_meta_value(val: Optional[str]) -> List[str]:
61
+ if not val:
62
+ return []
63
+ return _tokenize(val)
64
+
65
+ # --------------------------- DOCX parsing & chunking ---------------------------
66
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
67
  """
68
  Split DOCX into (section_title, paragraphs_in_section).
 
72
  sections: List[Tuple[str, List[str]]] = []
73
  current_title = None
74
  current_paras: List[str] = []
 
75
  for para in doc.paragraphs:
76
  text = (para.text or "").strip()
77
  style_name = (para.style.name if para.style else "") or ""
78
  is_heading = bool(re.match(r"Heading\s*\d+", style_name, flags=re.IGNORECASE))
 
79
  if is_heading and text:
80
  # commit previous section
81
  if current_title or current_paras:
 
85
  else:
86
  if text:
87
  current_paras.append(text)
 
88
  # final section
89
  if current_title or current_paras:
90
  sections.append((current_title or "Untitled Section", current_paras))
 
91
  # in case no headings at all, make one pseudo-section with all text
92
  if not sections:
93
  all_text = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
94
  sections = [("Document", all_text)]
 
95
  return sections
96
 
97
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
 
112
  chunks = [body]
113
  return chunks
114
 
115
+ # --------------------------- Ingestion ---------------------------
116
  def ingest_documents(folder_path: str) -> None:
117
  """
118
  Read .docx files, section-aware chunking, generate embeddings, store in ChromaDB,
 
143
  for s_idx, (section_title, paras) in enumerate(sections):
144
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
145
  total_chunks += len(chunks)
 
146
  for c_idx, chunk in enumerate(chunks):
147
  # Embedding & Chroma
148
  embedding = model.encode(chunk).tolist()
149
  doc_id = f"{file}:{s_idx}:{c_idx}" # stable unique id
150
  meta = {"filename": file, "section": section_title, "chunk_index": c_idx, "title": doc_title, "collection": "SOP"}
 
151
  try:
152
  collection.add(
153
  ids=[doc_id],
 
155
  documents=[chunk],
156
  metadatas=[meta],
157
  )
158
+ except Exception:
159
  # upsert on duplicate
160
  try:
161
  collection.delete(ids=[doc_id])
 
191
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
192
  bm25_ready = True
193
 
194
+ # persist BM25 index
195
+ payload = {
196
+ "bm25_docs": bm25_docs,
197
+ "bm25_inverted": bm25_inverted,
198
+ "bm25_df": bm25_df,
199
+ "bm25_avgdl": bm25_avgdl,
200
+ "BM25_K1": BM25_K1,
201
+ "BM25_B": BM25_B,
202
+ }
203
+ os.makedirs(CHROMA_PATH, exist_ok=True)
204
+ with open(BM25_INDEX_FILE, "wb") as f:
205
+ pickle.dump(payload, f)
206
+ print(f"βœ… BM25 index saved: {BM25_INDEX_FILE}")
 
207
  print(f"βœ… Documents ingested. Total entries in Chroma: {collection.count()}")
208
 
209
  def _load_bm25_index() -> None:
 
230
  # auto-load on import
231
  _load_bm25_index()
232
 
233
+ # --------------------------- BM25 search ---------------------------
234
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
235
  """
236
  Okapi BM25 score for a given doc.
 
249
  continue
250
  # BM25 idf
251
  N = len(bm25_docs)
252
+ idf_ratio = ( (N - df + 0.5) / (df + 0.5) )
 
 
 
253
  try:
254
  import math
255
+ idf = math.log(idf_ratio + 1.0)
256
  except Exception:
257
+ idf = 1.0
 
258
  denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
259
+ score += idf * ((tf * (BM25_K1 + 1)) / (denom or 1.0))
260
  return score
261
 
262
  def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
 
269
  q_terms = _tokenize(norm)
270
  if not q_terms:
271
  return []
272
+
273
  # collect candidate doc indices via inverted index
274
  candidates = set()
275
  for t in q_terms:
 
287
  scored.sort(key=lambda x: x[1], reverse=True)
288
  return scored[:top_k]
289
 
290
+ # --------------------------- Semantic-only (legacy) ---------------------------
 
291
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
292
  """
293
  Semantic-only search (Chroma). We DO NOT ask for 'ids' in include
 
302
  )
303
 
304
  # Flatten lists-per-query
305
+ docs_ll = res.get("documents", [[]]) or [[]]
306
  metas_ll = res.get("metadatas", [[]]) or [[]]
307
  dists_ll = res.get("distances", [[]]) or [[]]
308
+ ids_ll = res.get("ids", [[]]) or [[]] # some clients still return 'ids' anyway
309
 
310
+ documents = docs_ll[0] if docs_ll else []
311
  metadatas = metas_ll[0] if metas_ll else []
312
  distances = dists_ll[0] if dists_ll else []
313
+ ids = ids_ll[0] if ids_ll else []
314
 
315
  # If 'ids' is missing, synthesize stable IDs from metadata
316
  if not ids and documents:
317
  synthesized = []
318
  for i, m in enumerate(metadatas):
319
+ fn = (m or {}).get("filename", "unknown")
320
+ sec = (m or {}).get("section", "section")
321
+ idx = (m or {}).get("chunk_index", i)
322
  synthesized.append(f"{fn}:{sec}:{idx}")
323
  ids = synthesized
324
 
325
  print(f"πŸ”Ž KB search β†’ {len(documents)} docs (top_k={top_k}); "
326
  f"first distance: {distances[0] if distances else 'n/a'}; ids={len(ids)}")
 
327
  return {
328
  "documents": documents,
329
  "metadatas": metadatas,
 
331
  "ids": ids,
332
  }
333
 
334
+ # --------------------------- Hybrid (BM25 + Embeddings) ---------------------------
335
+ def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
336
+ """
337
+ Automatic metadata overlap score (no manual module list).
338
+ Uses filename, title, and section tokens. Range ~0..1.
339
+ """
340
+ if not meta:
341
+ return 0.0
342
+ fn_tokens = _tokenize_meta_value(meta.get("filename"))
343
+ title_tokens = _tokenize_meta_value(meta.get("title"))
344
+ section_tokens = _tokenize_meta_value(meta.get("section"))
345
+ meta_tokens = set(fn_tokens + title_tokens + section_tokens)
346
+ if not meta_tokens or not q_terms:
347
+ return 0.0
348
+ qset = set(q_terms)
349
+ inter = len(meta_tokens & qset)
350
+ return inter / max(1, len(qset))
351
+
352
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
353
  """
354
  Hybrid retrieval:
355
  - Semantic (Chroma/embeddings) β†’ distances (lower = better) β†’ convert to similarity
356
  - BM25 keyword β†’ score (higher = better)
357
+ - Re-rank union of candidates by:
358
+ final = alpha * semantic_sim + beta * bm25_norm + gamma * meta_overlap
359
+ - Document-level voting prior: aggregate scores by 'filename' and prefer the best document first.
360
+ Returns a dict compatible with the extractor and includes:
361
  - 'ids': list[str]
362
+ - 'combined_scores': list[float] (0..1ish)
 
363
  """
364
+ # 1) Normalize query (language-agnostic)
365
  norm_query = _normalize_query(query)
366
+ q_terms = _tokenize(norm_query)
367
 
368
  # 2) Semantic candidates (Chroma)
369
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
 
390
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
391
 
392
  # 4) Merge candidates by doc_id
 
393
  bm25_id_to_norm: Dict[str, float] = {}
394
  bm25_id_to_text: Dict[str, str] = {}
395
  bm25_id_to_meta: Dict[str, Dict[str, Any]] = {}
396
+
397
  for idx, nscore in bm25_norm_pairs:
398
  d = bm25_docs[idx]
399
  bm25_id_to_norm[d["id"]] = nscore
400
  bm25_id_to_text[d["id"]] = d["text"]
401
  bm25_id_to_meta[d["id"]] = d["meta"]
402
 
 
403
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
404
 
405
+ gamma = 0.25 # metadata boost weight (tunable)
406
+
407
+ combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float]] = [] # include meta_overlap
408
+
409
  for cid in union_ids:
410
  # semantic part
411
  if cid in sem_ids:
 
426
  text = sem_text if sem_text else bm25_text
427
  meta = sem_meta if sem_meta else bm25_meta
428
 
429
+ # NEW: automatic metadata overlap (no manual lists)
430
+ m_overlap = _meta_overlap(meta, q_terms)
431
+
432
  # final combined score
433
+ final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap
434
+
435
+ combined_records_ext.append(
436
+ (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap)
437
+ )
438
+
439
+ # ---------------- Document-level voting prior ----------------
440
+ # Group by filename and compute aggregate doc score β†’ prefer best doc first
441
+ from collections import defaultdict
442
+ doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float]]] = defaultdict(list)
443
+ for rec in combined_records_ext:
444
+ meta = rec[4] or {}
445
+ fn = meta.get("filename", "unknown")
446
+ doc_groups[fn].append(rec)
447
+
448
+ # Compute doc_prior = sum(final_score) + small bonus for metadata overlap sum
449
+ def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float]]) -> float:
450
+ total_score = sum(r[1] for r in recs)
451
+ total_meta = sum(r[5] for r in recs)
452
+ return total_score + 0.4 * total_meta # 0.4 is tunable
453
+
454
+ # Pick best document
455
+ best_doc = None
456
+ best_doc_prior = -1.0
457
+ for fn, recs in doc_groups.items():
458
+ p = doc_prior(recs)
459
+ if p > best_doc_prior:
460
+ best_doc_prior = p
461
+ best_doc = fn
462
+
463
+ # Reorder: take items from best_doc first (sorted by score), then others
464
+ best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
465
+ other_recs = []
466
+ for fn, recs in doc_groups.items():
467
+ if fn == best_doc:
468
+ continue
469
+ other_recs.extend(recs)
470
+ other_recs.sort(key=lambda x: x[1], reverse=True)
471
 
472
+ reordered = best_recs + other_recs
473
+ top = reordered[:top_k]
 
474
 
475
  documents = [t[3] for t in top]
476
  metadatas = [t[4] for t in top]
477
+ distances = [t[2] for t in top]
478
+ ids = [t[0] for t in top]
479
  combined_scores = [t[1] for t in top]
480
 
481
  return {
 
484
  "distances": distances,
485
  "ids": ids,
486
  "combined_scores": combined_scores,
487
+ "best_doc": best_doc, # helpful for debugging
488
+ "best_doc_prior": best_doc_prior, # helpful for debugging
489
  }