srilakshu012456 committed on
Commit
6f19669
·
verified ·
1 Parent(s): 7815846

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +183 -111
services/kb_creation.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import os
3
  import re
4
  import pickle
@@ -13,10 +12,10 @@ CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
13
  client = chromadb.PersistentClient(path=CHROMA_PATH)
14
  collection = client.get_or_create_collection(name="knowledge_base")
15
 
16
- # --------------------------- Embedding model ---------------------------
17
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
18
 
19
- # --------------------------- BM25 (lightweight) ---------------------------
20
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
21
  bm25_docs: List[Dict[str, Any]] = []
22
  bm25_inverted: Dict[str, List[int]] = {}
@@ -26,16 +25,18 @@ bm25_ready: bool = False
26
  BM25_K1 = 1.5
27
  BM25_B = 0.75
28
 
29
- # --------------------------- Utilities ---------------------------
30
  def _tokenize(text: str) -> List[str]:
31
  if not text:
32
  return []
33
  text = text.lower()
34
  return re.findall(r"[a-z0-9]+", text)
35
 
 
36
  def _normalize_query(q: str) -> str:
37
  q = (q or "").strip().lower()
38
  q = re.sub(r"[^\w\s]", " ", q)
 
39
  q = re.sub(
40
  r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b",
41
  " ",
@@ -44,43 +45,12 @@ def _normalize_query(q: str) -> str:
44
  q = re.sub(r"\s+", " ", q).strip()
45
  return q
46
 
 
47
def _tokenize_meta_value(val: Optional[str]) -> List[str]:
    """Tokenize a metadata string value, treating None as an empty string."""
    return _tokenize("" if val is None else val)
49
 
50
- # --------------------------- Semantic intent prototypes ---------------------------
51
- INTENT_PROTOTYPES: Dict[str, str] = {
52
- "steps": "Step-by-step procedure with actions the user must perform",
53
- "navigation": "Menu paths and locations in WMS, for example Navigate to Inbound > Receiving",
54
- "errors": "Common errors and resolution tips or troubleshooting guidance",
55
- "prereqs": "Pre-requisites, authorization, requirements before executing steps",
56
- "purpose": "Purpose, overview, introduction that explains why something is done",
57
- "escalation": "Escalation path or who to contact if the issue cannot be resolved",
58
- "permission": "User lacks authorization or access denied and needs role access check",
59
- }
60
-
61
- # Precompute prototype embeddings once
62
- PROTO_EMBS: Dict[str, List[float]] = {label: model.encode(text).tolist() for label, text in INTENT_PROTOTYPES.items()}
63
-
64
- def _embed(txt: str) -> List[float]:
65
- return model.encode((txt or "").strip()).tolist()
66
 
67
- def _cos_sim(a: List[float], b: List[float]) -> float:
68
- # pure-python cosine similarity
69
- dot = sum(x * y for x, y in zip(a, b))
70
- na = math.sqrt(sum(x * x for x in a)) + 1e-9
71
- nb = math.sqrt(sum(y * y for y in b)) + 1e-9
72
- return float(dot / (na * nb))
73
-
74
- def detect_user_intent(query: str) -> Tuple[str, float]:
75
- q_vec = _embed(query or "")
76
- best, best_s = "neutral", 0.0
77
- for label, proto_vec in PROTO_EMBS.items():
78
- s = _cos_sim(q_vec, proto_vec)
79
- if s > best_s:
80
- best, best_s = label, s
81
- return best, best_s # (intent label, confidence approx 0..1)
82
-
83
- # --------------------------- DOCX parsing & chunking ---------------------------
84
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
85
  sections: List[Tuple[str, List[str]]] = []
86
  current_title = None
@@ -104,8 +74,8 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
104
  sections = [("Document", all_text)]
105
  return sections
106
 
 
107
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
108
- # Store only body text (no titles/headers in chunk) so users never see SOP headers
109
  body = "\n".join(paragraphs).strip()
110
  if not body:
111
  return []
@@ -119,7 +89,22 @@ def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: Lis
119
  chunks = [body]
120
  return chunks
121
 
122
- # --------------------------- Ingestion ---------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  def ingest_documents(folder_path: str) -> None:
124
  print(f"📂 Checking folder: {folder_path}")
125
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
@@ -138,20 +123,10 @@ def ingest_documents(folder_path: str) -> None:
138
  doc = Document(file_path)
139
  sections = _split_by_sections(doc)
140
  total_chunks = 0
141
-
142
  for s_idx, (section_title, paras) in enumerate(sections):
143
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
144
  total_chunks += len(chunks)
145
-
146
- # --- Semantic section intent tagging (no keywords to maintain) ---
147
- section_text_for_tag = (section_title or "") + "\n" + ("\n".join(paras[:6]) if paras else "")
148
- sec_vec = _embed(section_text_for_tag)
149
- best_intent, best_score = "neutral", 0.0
150
- for label, proto_vec in PROTO_EMBS.items():
151
- s = _cos_sim(sec_vec, proto_vec)
152
- if s > best_score:
153
- best_intent, best_score = label, s
154
-
155
  for c_idx, chunk in enumerate(chunks):
156
  embedding = model.encode(chunk).tolist()
157
  doc_id = f"{file}:{s_idx}:{c_idx}"
@@ -161,8 +136,7 @@ def ingest_documents(folder_path: str) -> None:
161
  "chunk_index": c_idx,
162
  "title": doc_title,
163
  "collection": "SOP",
164
- "intent_tag": best_intent,
165
- "intent_score": best_score,
166
  }
167
  try:
168
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
@@ -173,28 +147,24 @@ def ingest_documents(folder_path: str) -> None:
173
  except Exception as e2:
174
  print(f"❌ Upsert failed for {doc_id}: {e2}")
175
 
176
- # BM25 indexing
177
  tokens = _tokenize(chunk)
178
  tf: Dict[str, int] = {}
179
  for t in tokens:
180
  tf[t] = tf.get(t, 0) + 1
181
  idx = len(bm25_docs)
182
  bm25_docs.append({"id": doc_id, "text": chunk, "tokens": tokens, "tf": tf, "length": len(tokens), "meta": meta})
183
-
184
  seen = set()
185
  for term in tf.keys():
186
  bm25_inverted.setdefault(term, []).append(idx)
187
  if term not in seen:
188
  bm25_df[term] = bm25_df.get(term, 0) + 1
189
  seen.add(term)
190
-
191
  print(f"📄 Ingested {file} → {total_chunks} chunks")
192
 
193
  N = len(bm25_docs)
194
  if N > 0:
195
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
196
  bm25_ready = True
197
-
198
  payload = {
199
  "bm25_docs": bm25_docs,
200
  "bm25_inverted": bm25_inverted,
@@ -209,6 +179,7 @@ def ingest_documents(folder_path: str) -> None:
209
  print(f"✅ BM25 index saved: {BM25_INDEX_FILE}")
210
  print(f"✅ Documents ingested. Total entries in Chroma: {collection.count()}")
211
 
 
212
  def _load_bm25_index() -> None:
213
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
214
  if not os.path.exists(BM25_INDEX_FILE):
@@ -226,9 +197,11 @@ def _load_bm25_index() -> None:
226
  except Exception as e:
227
  print(f"⚠️ Could not load BM25 index: {e}")
228
 
 
229
  _load_bm25_index()
230
 
231
- # --------------------------- BM25 search ---------------------------
 
232
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
233
  if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
234
  return 0.0
@@ -252,6 +225,7 @@ def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
252
  score += idf * ((tf * (BM25_K1 + 1)) / (denom or 1.0))
253
  return score
254
 
 
255
  def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
256
  if not bm25_ready:
257
  return []
@@ -273,35 +247,19 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
273
  scored.sort(key=lambda x: x[1], reverse=True)
274
  return scored[:top_k]
275
 
276
- # --------------------------- Semantic-only (Chroma) ---------------------------
 
277
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
278
  query_embedding = model.encode(query).tolist()
279
  res = collection.query(
280
  query_embeddings=[query_embedding],
281
  n_results=top_k,
282
- include=['documents', 'metadatas', 'distances']
283
  )
284
- docs_ll = res.get("documents", [[]]) or [[]]
285
- metas_ll = res.get("metadatas", [[]]) or [[]]
286
- dists_ll = res.get("distances", [[]]) or [[]]
287
- ids_ll = res.get("ids", [[]]) or [[]]
288
-
289
- documents = docs_ll[0] if docs_ll else []
290
- metadatas = metas_ll[0] if metas_ll else []
291
- distances = dists_ll[0] if dists_ll else []
292
- ids = ids_ll[0] if ids_ll else []
293
-
294
- if not ids and documents:
295
- synthesized = []
296
- for i, m in enumerate(metadatas):
297
- fn = (m or {}).get("filename", "unknown")
298
- sec = (m or {}).get("section", "section")
299
- idx = (m or {}).get("chunk_index", i)
300
- synthesized.append(f"{fn}:{sec}:{idx}")
301
- ids = synthesized
302
-
303
- print(f"🔎 KB search → {len(documents)} docs (top_k={top_k}); "
304
- f"first distance: {distances[0] if distances else 'n/a'}; ids={len(ids)}")
305
  return {
306
  "documents": documents,
307
  "metadatas": metadatas,
@@ -309,7 +267,77 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
309
  "ids": ids,
310
  }
311
 
312
- # --------------------------- Hybrid (BM25 + Embeddings + Semantic Intent) ---------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
314
  fn_tokens = _tokenize_meta_value(meta.get("filename"))
315
  title_tokens = _tokenize_meta_value(meta.get("title"))
@@ -321,11 +349,48 @@ def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
321
  inter = len(meta_tokens & qset)
322
  return inter / max(1, len(qset))
323
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
325
  norm_query = _normalize_query(query)
326
  q_terms = _tokenize(norm_query)
327
- user_intent, intent_conf = detect_user_intent(query) # semantic
328
 
 
 
 
 
 
 
 
 
329
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
330
  sem_docs = sem_res.get("documents", [])
331
  sem_metas = sem_res.get("metadatas", [])
@@ -342,10 +407,10 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
342
 
343
  sem_sims = [dist_to_sim(d) for d in sem_dists]
344
 
 
345
  bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
346
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
347
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
348
-
349
  bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
350
  for idx, nscore in bm25_norm_pairs:
351
  d = bm25_docs[idx]
@@ -355,8 +420,13 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
355
 
356
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
357
 
358
- gamma = 0.25 # metadata overlap weight
359
- combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float]] = [] # id, score, dist, text, meta, overlap, intentBoost
 
 
 
 
 
360
 
361
  for cid in union_ids:
362
  if cid in sem_ids:
@@ -375,38 +445,38 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
375
  text = sem_text if sem_text else bm25_text
376
  meta = sem_meta if sem_meta else bm25_meta
377
 
378
- m_overlap = _meta_overlap(meta, q_terms)
379
- tag = (meta or {}).get("intent_tag", "neutral")
380
- tag_conf = float((meta or {}).get("intent_score", 0.0))
381
-
382
- # Semantic intent boost (no keyword list)
383
- intent_boost = 0.0
384
- if user_intent != "neutral":
385
- if tag == user_intent:
386
- intent_boost = 0.7 * (0.5 + 0.5 * tag_conf) # stronger if section is confidently tagged
387
- elif tag_conf > 0.4:
388
- intent_boost = -0.3 * tag_conf # soft penalty if clearly different and confident
389
-
390
- final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + intent_boost
391
-
392
  combined_records_ext.append(
393
- (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost)
394
  )
395
 
396
- # ---------------- Document-level voting prior ----------------
397
  from collections import defaultdict
398
- doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float]]] = defaultdict(list)
399
  for rec in combined_records_ext:
400
  meta = rec[4] or {}
401
  fn = meta.get("filename", "unknown")
402
  doc_groups[fn].append(rec)
403
 
404
- def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float]]) -> float:
405
  total_score = sum(r[1] for r in recs)
406
  total_overlap = sum(r[5] for r in recs)
407
- total_intent = sum(max(0.0, r[6]) for r in recs) # positive boosts
408
- total_penalty = sum(min(0.0, r[6]) for r in recs) # penalties
409
- return total_score + 0.4 * total_overlap + 0.6 * total_intent + 0.3 * total_penalty
 
 
410
 
411
  best_doc, best_doc_prior = None, -1.0
412
  for fn, recs in doc_groups.items():
@@ -421,10 +491,8 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
421
  continue
422
  other_recs.extend(recs)
423
  other_recs.sort(key=lambda x: x[1], reverse=True)
424
-
425
  reordered = best_recs + other_recs
426
  top = reordered[:top_k]
427
-
428
  documents = [t[3] for t in top]
429
  metadatas = [t[4] for t in top]
430
  distances = [t[2] for t in top]
@@ -440,10 +508,11 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
440
  "best_doc": best_doc,
441
  "best_doc_prior": best_doc_prior,
442
  "user_intent": user_intent,
443
- "user_intent_conf": intent_conf,
444
  }
445
 
446
- # --------------------------- Section fetch helpers ---------------------------
 
447
  def get_section_text(filename: str, section: str) -> str:
448
  """Concatenate all chunk texts for a given filename+section."""
449
  texts: List[str] = []
@@ -455,6 +524,7 @@ def get_section_text(filename: str, section: str) -> str:
455
  texts.append(t)
456
  return "\n\n".join(texts).strip()
457
 
 
458
  def get_best_steps_section_text(filename: str) -> str:
459
  """Return combined text of all 'steps' sections in the given SOP (filename)."""
460
  texts: List[str] = []
@@ -466,7 +536,8 @@ def get_best_steps_section_text(filename: str) -> str:
466
  texts.append(t)
467
  return "\n\n".join(texts).strip()
468
 
469
- # --------------------------- Admin helpers ---------------------------
 
470
  def get_kb_runtime_info() -> Dict[str, Any]:
471
  return {
472
  "chroma_path": CHROMA_PATH,
@@ -477,6 +548,7 @@ def get_kb_runtime_info() -> Dict[str, Any]:
477
  "bm25_ready": bm25_ready,
478
  }
479
 
 
480
  def reset_kb(folder_path: str) -> Dict[str, Any]:
481
  result = {"status": "OK", "message": "KB reset and re-ingested"}
482
  try:
 
 
1
  import os
2
  import re
3
  import pickle
 
12
  client = chromadb.PersistentClient(path=CHROMA_PATH)
13
  collection = client.get_or_create_collection(name="knowledge_base")
14
 
15
+ # --------------------------- Embedding model --------------------------
16
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
17
 
18
+ # --------------------------- BM25 (lightweight) -----------------------
19
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
20
  bm25_docs: List[Dict[str, Any]] = []
21
  bm25_inverted: Dict[str, List[int]] = {}
 
25
  BM25_K1 = 1.5
26
  BM25_B = 0.75
27
 
28
+ # --------------------------- Utilities --------------------------------
29
  def _tokenize(text: str) -> List[str]:
30
  if not text:
31
  return []
32
  text = text.lower()
33
  return re.findall(r"[a-z0-9]+", text)
34
 
35
+
36
  def _normalize_query(q: str) -> str:
37
  q = (q or "").strip().lower()
38
  q = re.sub(r"[^\w\s]", " ", q)
39
+ # remove filler issue words
40
  q = re.sub(
41
  r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b",
42
  " ",
 
45
  q = re.sub(r"\s+", " ", q).strip()
46
  return q
47
 
48
+
49
  def _tokenize_meta_value(val: Optional[str]) -> List[str]:
50
  return _tokenize(val or "")
51
 
52
+ # ---------------------- DOCX parsing & chunking -----------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
55
  sections: List[Tuple[str, List[str]]] = []
56
  current_title = None
 
74
  sections = [("Document", all_text)]
75
  return sections
76
 
77
+
78
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
 
79
  body = "\n".join(paragraphs).strip()
80
  if not body:
81
  return []
 
89
  chunks = [body]
90
  return chunks
91
 
92
+ # ---------------------- Intent tagging (section-based) ----------------
93
+
94
+ def _infer_intent_tag(section_title: str) -> str:
95
+ st = (section_title or "").lower()
96
+ if any(k in st for k in ["process steps", "procedure", "how to", "workflow", "instructions"]):
97
+ return "steps"
98
+ if any(k in st for k in ["common errors", "resolution", "troubleshooting"]):
99
+ return "errors"
100
+ if any(k in st for k in ["pre-requisites", "prerequisites"]):
101
+ return "prereqs"
102
+ if any(k in st for k in ["purpose", "overview", "introduction"]):
103
+ return "purpose"
104
+ return "neutral"
105
+
106
+ # ---------------------- Ingestion ------------------------------------
107
+
108
  def ingest_documents(folder_path: str) -> None:
109
  print(f"📂 Checking folder: {folder_path}")
110
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
 
123
  doc = Document(file_path)
124
  sections = _split_by_sections(doc)
125
  total_chunks = 0
 
126
  for s_idx, (section_title, paras) in enumerate(sections):
127
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
128
  total_chunks += len(chunks)
129
+ intent_tag = _infer_intent_tag(section_title)
 
 
 
 
 
 
 
 
 
130
  for c_idx, chunk in enumerate(chunks):
131
  embedding = model.encode(chunk).tolist()
132
  doc_id = f"{file}:{s_idx}:{c_idx}"
 
136
  "chunk_index": c_idx,
137
  "title": doc_title,
138
  "collection": "SOP",
139
+ "intent_tag": intent_tag,
 
140
  }
141
  try:
142
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
 
147
  except Exception as e2:
148
  print(f"❌ Upsert failed for {doc_id}: {e2}")
149
 
 
150
  tokens = _tokenize(chunk)
151
  tf: Dict[str, int] = {}
152
  for t in tokens:
153
  tf[t] = tf.get(t, 0) + 1
154
  idx = len(bm25_docs)
155
  bm25_docs.append({"id": doc_id, "text": chunk, "tokens": tokens, "tf": tf, "length": len(tokens), "meta": meta})
 
156
  seen = set()
157
  for term in tf.keys():
158
  bm25_inverted.setdefault(term, []).append(idx)
159
  if term not in seen:
160
  bm25_df[term] = bm25_df.get(term, 0) + 1
161
  seen.add(term)
 
162
  print(f"📄 Ingested {file} → {total_chunks} chunks")
163
 
164
  N = len(bm25_docs)
165
  if N > 0:
166
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
167
  bm25_ready = True
 
168
  payload = {
169
  "bm25_docs": bm25_docs,
170
  "bm25_inverted": bm25_inverted,
 
179
  print(f"✅ BM25 index saved: {BM25_INDEX_FILE}")
180
  print(f"✅ Documents ingested. Total entries in Chroma: {collection.count()}")
181
 
182
+
183
  def _load_bm25_index() -> None:
184
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
185
  if not os.path.exists(BM25_INDEX_FILE):
 
197
  except Exception as e:
198
  print(f"⚠️ Could not load BM25 index: {e}")
199
 
200
+
201
  _load_bm25_index()
202
 
203
+ # ---------------------- BM25 search ----------------------------------
204
+
205
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
206
  if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
207
  return 0.0
 
225
  score += idf * ((tf * (BM25_K1 + 1)) / (denom or 1.0))
226
  return score
227
 
228
+
229
  def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
230
  if not bm25_ready:
231
  return []
 
247
  scored.sort(key=lambda x: x[1], reverse=True)
248
  return scored[:top_k]
249
 
250
+ # ---------------------- Semantic-only --------------------------------
251
+
252
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
253
  query_embedding = model.encode(query).tolist()
254
  res = collection.query(
255
  query_embeddings=[query_embedding],
256
  n_results=top_k,
257
+ include=['documents', 'metadatas', 'distances', 'ids']
258
  )
259
+ documents = (res.get("documents", [[]]) or [[]])[0]
260
+ metadatas = (res.get("metadatas", [[]]) or [[]])[0]
261
+ distances = (res.get("distances", [[]]) or [[]])[0]
262
+ ids = (res.get("ids", [[]]) or [[]])[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  return {
264
  "documents": documents,
265
  "metadatas": metadatas,
 
267
  "ids": ids,
268
  }
269
 
270
+ # ---------------------- Semantic intent + Hybrid ranking --------------
271
+
272
+ # Semantic intent prototypes (generic, wording-agnostic)
273
# Semantic intent prototypes (generic, wording-agnostic). Each label maps to
# representative phrases describing that intent.
INTENT_PROTOTYPES = {
    "steps": [
        "how to perform", "procedure", "workflow", "instructions",
        "steps to accomplish", "operate", "process to follow",
    ],
    "errors": [
        "error condition", "issue troubleshooting", "resolution steps",
        "fix failure", "diagnose problem",
    ],
    "prereqs": [
        "pre-requisites", "requirements before starting", "setup needed",
    ],
    "purpose": [
        "overview", "purpose", "introduction", "what is this about",
    ],
    "neutral": ["general information", "context", "details"],
}

# One prototype embedding per intent, computed once at import time from the
# phrases joined with " ; " so later queries only pay for their own encoding.
INTENT_PROTO_VECS = {
    intent: model.encode(" ; ".join(phrases)).tolist()
    for intent, phrases in INTENT_PROTOTYPES.items()
}
292
+
293
+
294
+ def _cosine(a: list, b: list) -> float:
295
+ if not a or not b or len(a) != len(b):
296
+ return 0.0
297
+ dot = sum(x * y for x, y in zip(a, b))
298
+ na = math.sqrt(sum(x * x for x in a)) or 1.0
299
+ nb = math.sqrt(sum(y * y for y in b)) or 1.0
300
+ return dot / (na * nb)
301
+
302
+
303
def classify_intent_semantic(query: str, min_margin: float = 0.08) -> str:
    """Meaning-based intent classification using sentence embeddings.

    Embeds *query* and compares it to each precomputed intent prototype
    vector. The winning label is returned only when it beats the runner-up
    similarity by at least *min_margin*; otherwise "neutral" is returned.
    """
    query_vec = model.encode((query or "").strip()).tolist()
    scores = {name: _cosine(query_vec, vec) for name, vec in INTENT_PROTO_VECS.items()}
    best_name = max(scores, key=scores.get)
    runner_up = sorted(scores.values(), reverse=True)[1] if len(scores) > 1 else 0.0
    if scores[best_name] - runner_up >= min_margin:
        # Returning best_name covers the "neutral wins" case too.
        return best_name
    return "neutral"
312
+
313
+ ACTION_SYNONYMS = {
314
+ "create": ["create", "creation", "add", "new", "generate"],
315
+ "update": ["update", "modify", "change", "edit"],
316
+ "delete": ["delete", "remove"],
317
+ "navigate": ["navigate", "go to", "open"],
318
+ }
319
+
320
+
321
+ def _extract_actions(query: str) -> List[str]:
322
+ q = (query or "").lower()
323
+ found = []
324
+ for act, syns in ACTION_SYNONYMS.items():
325
+ if any(s in q for s in syns):
326
+ found.append(act)
327
+ return found or []
328
+
329
+
330
+ def _intent_weight(meta: dict, user_intent: str) -> float:
331
+ tag = (meta or {}).get("intent_tag", "neutral")
332
+ if user_intent == "neutral":
333
+ return 0.0
334
+ if tag == user_intent:
335
+ return 1.0
336
+ if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]:
337
+ return -0.6
338
+ return -0.2
339
+
340
+
341
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
342
  fn_tokens = _tokenize_meta_value(meta.get("filename"))
343
  title_tokens = _tokenize_meta_value(meta.get("title"))
 
349
  inter = len(meta_tokens & qset)
350
  return inter / max(1, len(qset))
351
 
352
+
353
+ def _semantic_meta_overlap(meta: Dict[str, Any], query_vec: List[float]) -> float:
354
+ """Compare query vector to semantic vector of filename/title/section."""
355
+ if not meta:
356
+ return 0.0
357
+ s = " ".join([str(meta.get("filename", "")), str(meta.get("title", "")), str(meta.get("section", ""))]).strip()
358
+ if not s:
359
+ return 0.0
360
+ mv = model.encode(s).tolist()
361
+ return max(0.0, _cosine(query_vec, mv))
362
+
363
+
364
def _action_weight(text: str, actions: List[str]) -> float:
    """Reward chunks mentioning the requested actions, penalize conflicts.

    Adds +1.0 for every synonym of a requested action found in *text* and
    subtracts 0.8 for every synonym of a conflicting action (e.g. "delete"
    wording when the user asked to "create"). Returns 0.0 when no actions
    were requested.
    """
    if not actions:
        return 0.0
    lowered = (text or "").lower()
    score = 0.0
    for action in actions:
        for syn in ACTION_SYNONYMS.get(action, [action]):
            if syn in lowered:
                score += 1.0
    # NOTE(review): conflicts are asymmetric on purpose here ("update" treats
    # "delete" as a rival but not vice versa) — mirrors the original table.
    conflicts = {"create": ["delete"], "delete": ["create"], "update": ["delete"], "navigate": []}
    for action in actions:
        for rival in conflicts.get(action, []):
            for syn in ACTION_SYNONYMS.get(rival, [rival]):
                if syn in lowered:
                    score -= 0.8
    return score
380
+
381
+
382
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
383
  norm_query = _normalize_query(query)
384
  q_terms = _tokenize(norm_query)
 
385
 
386
+ # semantic intent
387
+ user_intent = classify_intent_semantic(query)
388
+ actions = _extract_actions(query)
389
+
390
+ # query vector
391
+ query_vec = model.encode(norm_query).tolist()
392
+
393
+ # semantic results
394
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
395
  sem_docs = sem_res.get("documents", [])
396
  sem_metas = sem_res.get("metadatas", [])
 
407
 
408
  sem_sims = [dist_to_sim(d) for d in sem_dists]
409
 
410
+ # bm25 results
411
  bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
412
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
413
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
 
414
  bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
415
  for idx, nscore in bm25_norm_pairs:
416
  d = bm25_docs[idx]
 
420
 
421
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
422
 
423
+ # weights
424
+ gamma = 0.25 # lexical meta overlap
425
+ delta = 0.35 # intent boost
426
+ epsilon = 0.25 # action weight
427
+ zeta = 0.35 # semantic meta similarity
428
+
429
+ combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]] = []
430
 
431
  for cid in union_ids:
432
  if cid in sem_ids:
 
445
  text = sem_text if sem_text else bm25_text
446
  meta = sem_meta if sem_meta else bm25_meta
447
 
448
+ m_overlap = _meta_overlap(meta, q_terms) # lexical overlap
449
+ m_sem = _semantic_meta_overlap(meta, query_vec) # semantic overlap
450
+ intent_boost = _intent_weight(meta, user_intent)
451
+ act_wt = _action_weight(text, actions)
452
+
453
+ final_score = (
454
+ alpha * sem_sim +
455
+ beta * bm25_sim +
456
+ gamma * m_overlap +
457
+ zeta * m_sem +
458
+ delta * intent_boost +
459
+ epsilon * act_wt
460
+ )
 
461
  combined_records_ext.append(
462
+ (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, m_sem)
463
  )
464
 
 
465
  from collections import defaultdict
466
+ doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]]] = defaultdict(list)
467
  for rec in combined_records_ext:
468
  meta = rec[4] or {}
469
  fn = meta.get("filename", "unknown")
470
  doc_groups[fn].append(rec)
471
 
472
+ def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]]) -> float:
473
  total_score = sum(r[1] for r in recs)
474
  total_overlap = sum(r[5] for r in recs)
475
+ total_intent = sum(max(0.0, r[6]) for r in recs)
476
+ total_action = sum(max(0.0, r[7]) for r in recs)
477
+ total_sem_meta = sum(r[8] for r in recs)
478
+ total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
479
+ return total_score + 0.4 * total_overlap + 0.6 * total_intent + 0.5 * total_action + 0.6 * total_sem_meta + 0.3 * total_penalty
480
 
481
  best_doc, best_doc_prior = None, -1.0
482
  for fn, recs in doc_groups.items():
 
491
  continue
492
  other_recs.extend(recs)
493
  other_recs.sort(key=lambda x: x[1], reverse=True)
 
494
  reordered = best_recs + other_recs
495
  top = reordered[:top_k]
 
496
  documents = [t[3] for t in top]
497
  metadatas = [t[4] for t in top]
498
  distances = [t[2] for t in top]
 
508
  "best_doc": best_doc,
509
  "best_doc_prior": best_doc_prior,
510
  "user_intent": user_intent,
511
+ "actions": actions,
512
  }
513
 
514
+ # ---------------------- Section fetch helpers -------------------------
515
+
516
  def get_section_text(filename: str, section: str) -> str:
517
  """Concatenate all chunk texts for a given filename+section."""
518
  texts: List[str] = []
 
524
  texts.append(t)
525
  return "\n\n".join(texts).strip()
526
 
527
+
528
  def get_best_steps_section_text(filename: str) -> str:
529
  """Return combined text of all 'steps' sections in the given SOP (filename)."""
530
  texts: List[str] = []
 
536
  texts.append(t)
537
  return "\n\n".join(texts).strip()
538
 
539
+ # ---------------------- Admin helpers --------------------------------
540
+
541
  def get_kb_runtime_info() -> Dict[str, Any]:
542
  return {
543
  "chroma_path": CHROMA_PATH,
 
548
  "bm25_ready": bm25_ready,
549
  }
550
 
551
+
552
  def reset_kb(folder_path: str) -> Dict[str, Any]:
553
  result = {"status": "OK", "message": "KB reset and re-ingested"}
554
  try: