srilakshu012456 committed on
Commit
c6f40fa
·
verified ·
1 Parent(s): 5e31c75

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +189 -141
services/kb_creation.py CHANGED
@@ -1,33 +1,20 @@
1
-
2
- #!/usr/bin/env python3
3
- # -*- coding: utf-8 -*-
4
- """
5
- services/kb_creation.py
6
-
7
- Generic, meaning-aware intent & ranking:
8
- - Semantic intent classification (no keyword rules).
9
- - Hybrid score = semantic similarity + BM25 + lexical meta overlap + semantic meta overlap.
10
- - Chroma 'include' excludes 'ids'; IDs synthesized from metadata.
11
- """
12
-
13
  import os
14
  import re
15
  import pickle
16
- import math
17
  from typing import List, Dict, Any, Tuple, Optional
18
  from docx import Document
19
  from sentence_transformers import SentenceTransformer
20
  import chromadb
21
 
22
- # -------- ChromaDB --------
23
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
24
  client = chromadb.PersistentClient(path=CHROMA_PATH)
25
  collection = client.get_or_create_collection(name="knowledge_base")
26
 
27
- # -------- Embeddings --------
28
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
29
 
30
- # -------- BM25 (lightweight) --------
31
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
32
  bm25_docs: List[Dict[str, Any]] = []
33
  bm25_inverted: Dict[str, List[int]] = {}
@@ -37,26 +24,32 @@ bm25_ready: bool = False
37
  BM25_K1 = 1.5
38
  BM25_B = 0.75
39
 
40
- # -------- Utilities --------
41
  def _tokenize(text: str) -> List[str]:
42
- if not text: return []
 
43
  text = text.lower()
44
  return re.findall(r"[a-z0-9]+", text)
45
 
46
  def _normalize_query(q: str) -> str:
47
  q = (q or "").strip().lower()
48
  q = re.sub(r"[^\w\s]", " ", q)
49
- q = re.sub(r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b", " ", q)
 
 
 
 
50
  q = re.sub(r"\s+", " ", q).strip()
51
  return q
52
 
53
  def _tokenize_meta_value(val: Optional[str]) -> List[str]:
54
  return _tokenize(val or "")
55
 
56
- # -------- DOCX parsing & chunking --------
57
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
58
  sections: List[Tuple[str, List[str]]] = []
59
- current_title, current_paras = None, []
 
60
  for para in doc.paragraphs:
61
  text = (para.text or "").strip()
62
  style_name = (para.style.name if para.style else "") or ""
@@ -64,7 +57,8 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
64
  if is_heading and text:
65
  if current_title or current_paras:
66
  sections.append((current_title or "Untitled Section", current_paras))
67
- current_title, current_paras = text, []
 
68
  else:
69
  if text:
70
  current_paras.append(text)
@@ -77,29 +71,39 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
77
 
78
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
79
  body = "\n".join(paragraphs).strip()
80
- if not body: return []
 
81
  words = body.split()
82
  chunks: List[str] = []
83
  for i in range(0, len(words), max_words):
84
  chunk_body = ' '.join(words[i:i + max_words]).strip()
85
  if chunk_body:
86
- chunks.append(chunk_body)
87
- if not chunks: chunks = [body]
 
88
  return chunks
89
 
90
- # -------- Intent tag from section (for metadata only) --------
91
  def _infer_intent_tag(section_title: str) -> str:
92
  st = (section_title or "").lower()
93
- if any(k in st for k in ["process steps", "procedure", "how to", "workflow", "instructions"]): return "steps"
94
- if any(k in st for k in ["common errors", "resolution", "troubleshooting"]): return "errors"
95
- if any(k in st for k in ["pre-requisites", "prerequisites"]): return "prereqs"
96
- if any(k in st for k in ["purpose", "overview", "introduction"]): return "purpose"
 
 
 
 
97
  return "neutral"
98
 
99
- # -------- Ingestion --------
100
  def ingest_documents(folder_path: str) -> None:
 
101
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
102
- if not files: return
 
 
 
103
 
104
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
105
  bm25_docs, bm25_inverted, bm25_df = [], {}, {}
@@ -110,8 +114,11 @@ def ingest_documents(folder_path: str) -> None:
110
  doc_title = os.path.splitext(file)[0]
111
  doc = Document(file_path)
112
  sections = _split_by_sections(doc)
 
 
113
  for s_idx, (section_title, paras) in enumerate(sections):
114
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
 
115
  intent_tag = _infer_intent_tag(section_title)
116
  for c_idx, chunk in enumerate(chunks):
117
  embedding = model.encode(chunk).tolist()
@@ -122,7 +129,7 @@ def ingest_documents(folder_path: str) -> None:
122
  "chunk_index": c_idx,
123
  "title": doc_title,
124
  "collection": "SOP",
125
- "intent_tag": intent_tag,
126
  }
127
  try:
128
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
@@ -130,14 +137,16 @@ def ingest_documents(folder_path: str) -> None:
130
  try:
131
  collection.delete(ids=[doc_id])
132
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
133
- except Exception:
134
- pass
135
 
136
  tokens = _tokenize(chunk)
137
  tf: Dict[str, int] = {}
138
- for t in tokens: tf[t] = tf.get(t, 0) + 1
 
139
  idx = len(bm25_docs)
140
  bm25_docs.append({"id": doc_id, "text": chunk, "tokens": tokens, "tf": tf, "length": len(tokens), "meta": meta})
 
141
  seen = set()
142
  for term in tf.keys():
143
  bm25_inverted.setdefault(term, []).append(idx)
@@ -145,20 +154,31 @@ def ingest_documents(folder_path: str) -> None:
145
  bm25_df[term] = bm25_df.get(term, 0) + 1
146
  seen.add(term)
147
 
 
 
148
  N = len(bm25_docs)
149
  if N > 0:
150
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
151
  bm25_ready = True
152
 
153
- payload = {"bm25_docs": bm25_docs, "bm25_inverted": bm25_inverted, "bm25_df": bm25_df,
154
- "bm25_avgdl": bm25_avgdl, "BM25_K1": BM25_K1, "BM25_B": BM25_B}
 
 
 
 
 
 
155
  os.makedirs(CHROMA_PATH, exist_ok=True)
156
  with open(BM25_INDEX_FILE, "wb") as f:
157
  pickle.dump(payload, f)
 
 
158
 
159
  def _load_bm25_index() -> None:
160
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
161
- if not os.path.exists(BM25_INDEX_FILE): return
 
162
  try:
163
  with open(BM25_INDEX_FILE, "rb") as f:
164
  payload = pickle.load(f)
@@ -167,114 +187,132 @@ def _load_bm25_index() -> None:
167
  bm25_df = payload.get("bm25_df", {})
168
  bm25_avgdl = payload.get("bm25_avgdl", 0.0)
169
  bm25_ready = len(bm25_docs) > 0
170
- except Exception:
171
- pass
 
 
172
 
173
  _load_bm25_index()
174
 
175
- # -------- BM25 search --------
176
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
177
- if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs): return 0.0
 
178
  doc = bm25_docs[doc_idx]
179
- score, dl = 0.0, (doc["length"] or 1)
 
180
  for term in query_terms:
181
  df = bm25_df.get(term, 0)
182
- if df == 0: continue
 
183
  tf = doc["tf"].get(term, 0)
184
- if tf == 0: continue
 
185
  N = len(bm25_docs)
186
  idf_ratio = ((N - df + 0.5) / (df + 0.5))
187
- try: idf = math.log(idf_ratio + 1.0)
188
- except Exception: idf = 1.0
 
 
 
189
  denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
190
  score += idf * ((tf * (BM25_K1 + 1)) / (denom or 1.0))
191
  return score
192
 
193
  def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
194
- if not bm25_ready: return []
 
195
  norm = _normalize_query(query)
196
  q_terms = _tokenize(norm)
197
- if not q_terms: return []
 
198
  candidates = set()
199
  for t in q_terms:
200
- for idx in bm25_inverted.get(t, []): candidates.add(idx)
201
- if not candidates: candidates = set(range(len(bm25_docs)))
 
 
202
  scored = []
203
  for idx in candidates:
204
  s = _bm25_score_for_doc(q_terms, idx)
205
- if s > 0: scored.append((idx, s))
 
206
  scored.sort(key=lambda x: x[1], reverse=True)
207
  return scored[:top_k]
208
 
209
- # -------- SAFE Chroma query and semantic-only --------
210
- def _safe_collection_query(query_embedding, top_k: int):
211
- base_include = ['documents', 'metadatas', 'distances'] # supported
212
- return collection.query(query_embeddings=[query_embedding], n_results=top_k, include=base_include)
213
-
214
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
215
  query_embedding = model.encode(query).tolist()
216
- res = _safe_collection_query(query_embedding, top_k)
217
- documents = (res.get("documents", [[]]) or [[]])[0]
218
- metadatas = (res.get("metadatas", [[]]) or [[]])[0]
219
- distances = (res.get("distances", [[]]) or [[]])[0]
220
- # synthesize ids
221
- ids = []
222
- for i, m in enumerate(metadatas):
223
- fn = (m or {}).get("filename", "unknown")
224
- sec = (m or {}).get("section", "section")
225
- idx = (m or {}).get("chunk_index", i)
226
- ids.append(f"{fn}:{sec}:{idx}")
227
- return {"documents": documents, "metadatas": metadatas, "distances": distances, "ids": ids}
228
-
229
- # -------- Semantic intent + Hybrid ranking --------
230
- INTENT_PROTOTYPES = {
231
- "steps": ["how to perform", "procedure", "workflow", "instructions",
232
- "steps to accomplish", "operate", "process to follow"],
233
- "errors": ["error condition", "issue troubleshooting", "resolution steps",
234
- "fix failure", "diagnose problem",
235
- # NEW: permission/authorization/role access signals
236
- "permission denied", "not authorized", "authorization required",
237
- "role access missing", "access not allowed", "insufficient privileges"],
238
- "prereqs": ["pre-requisites", "requirements before starting", "setup needed"],
239
- "purpose": ["overview", "purpose", "introduction", "what is this about"],
240
- "neutral": ["general information", "context", "details"],
241
- }
242
- INTENT_PROTO_VECS = {name: model.encode(" ; ".join(phrases)).tolist() for name, phrases in INTENT_PROTOTYPES.items()}
243
-
244
- def _cosine(a: list, b: list) -> float:
245
- if not a or not b or len(a) != len(b): return 0.0
246
- dot = sum(x*y for x, y in zip(a, b))
247
- na = math.sqrt(sum(x*x for x in a)) or 1.0
248
- nb = math.sqrt(sum(y*y for y in b)) or 1.0
249
- return dot / (na * nb)
250
-
251
- def classify_intent_semantic(query: str, min_margin: float = 0.08) -> str:
252
- qv = model.encode((query or "").strip()).tolist()
253
- scores = {name: _cosine(qv, vec) for name, vec in INTENT_PROTO_VECS.items()}
254
- best = max(scores.items(), key=lambda kv: kv[1])
255
- second = sorted(scores.values(), reverse=True)[1] if len(scores) > 1 else 0.0
256
- if best[1] - second >= min_margin: return best[0] if best[0] != "neutral" else "neutral"
257
- return "neutral"
258
 
 
259
  ACTION_SYNONYMS = {
260
  "create": ["create", "creation", "add", "new", "generate"],
261
  "update": ["update", "modify", "change", "edit"],
262
  "delete": ["delete", "remove"],
263
  "navigate": ["navigate", "go to", "open"],
 
264
  }
265
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  def _extract_actions(query: str) -> List[str]:
267
  q = (query or "").lower()
268
  found = []
269
  for act, syns in ACTION_SYNONYMS.items():
270
- if any(s in q for s in syns): found.append(act)
 
271
  return found or []
272
 
273
  def _intent_weight(meta: dict, user_intent: str) -> float:
274
  tag = (meta or {}).get("intent_tag", "neutral")
275
- if user_intent == "neutral": return 0.0
276
- if tag == user_intent: return 1.0
277
- if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]: return -0.6
 
 
 
278
  return -0.2
279
 
280
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
@@ -282,38 +320,34 @@ def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
282
  title_tokens = _tokenize_meta_value(meta.get("title"))
283
  section_tokens = _tokenize_meta_value(meta.get("section"))
284
  meta_tokens = set(fn_tokens + title_tokens + section_tokens)
285
- if not meta_tokens or not q_terms: return 0.0
 
286
  qset = set(q_terms)
287
  inter = len(meta_tokens & qset)
288
  return inter / max(1, len(qset))
289
 
290
- def _semantic_meta_overlap(meta: Dict[str, Any], query_vec: List[float]) -> float:
291
- s = " ".join([str(meta.get("filename", "")), str(meta.get("title", "")), str(meta.get("section", ""))]).strip()
292
- if not s: return 0.0
293
- mv = model.encode(s).tolist()
294
- return max(0.0, _cosine(query_vec, mv))
295
-
296
  def _action_weight(text: str, actions: List[str]) -> float:
297
- if not actions: return 0.0
 
298
  t = (text or "").lower()
299
  score = 0.0
300
  for act in actions:
301
  for syn in ACTION_SYNONYMS.get(act, [act]):
302
- if syn in t: score += 1.0
 
303
  conflicts = {"create": ["delete"], "delete": ["create"], "update": ["delete"], "navigate": []}
304
  for act in actions:
305
  for bad in conflicts.get(act, []):
306
  for syn in ACTION_SYNONYMS.get(bad, [bad]):
307
- if syn in t: score -= 0.8
 
308
  return score
309
 
310
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
311
  norm_query = _normalize_query(query)
312
  q_terms = _tokenize(norm_query)
313
-
314
- user_intent = classify_intent_semantic(query) # semantic intent
315
  actions = _extract_actions(query)
316
- query_vec = model.encode(norm_query).tolist()
317
 
318
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
319
  sem_docs = sem_res.get("documents", [])
@@ -322,15 +356,19 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
322
  sem_ids = sem_res.get("ids", [])
323
 
324
  def dist_to_sim(d: Optional[float]) -> float:
325
- if d is None: return 0.0
326
- try: return 1.0 / (1.0 + float(d))
327
- except Exception: return 0.0
 
 
 
328
 
329
  sem_sims = [dist_to_sim(d) for d in sem_dists]
330
 
331
  bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
332
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
333
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
 
334
  bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
335
  for idx, nscore in bm25_norm_pairs:
336
  d = bm25_docs[idx]
@@ -340,13 +378,11 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
340
 
341
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
342
 
343
- # weights
344
- gamma = 0.25 # lexical meta overlap
345
- delta = 0.35 # intent tag vs user intent
346
- epsilon = 0.25 # action weight
347
- zeta = 0.35 # semantic meta similarity (NEW)
348
 
349
- combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]] = []
350
  for cid in union_ids:
351
  if cid in sem_ids:
352
  pos = sem_ids.index(cid)
@@ -365,28 +401,29 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
365
  meta = sem_meta if sem_meta else bm25_meta
366
 
367
  m_overlap = _meta_overlap(meta, q_terms)
368
- m_sem = _semantic_meta_overlap(meta, query_vec) # NEW semantic meta
369
  intent_boost = _intent_weight(meta, user_intent)
370
  act_wt = _action_weight(text, actions)
371
 
372
- final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + zeta * m_sem + delta * intent_boost + epsilon * act_wt
373
- combined_records_ext.append((cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, m_sem))
 
 
 
374
 
375
  from collections import defaultdict
376
- doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]]] = defaultdict(list)
377
  for rec in combined_records_ext:
378
  meta = rec[4] or {}
379
  fn = meta.get("filename", "unknown")
380
  doc_groups[fn].append(rec)
381
 
382
- def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]]) -> float:
383
  total_score = sum(r[1] for r in recs)
384
  total_overlap = sum(r[5] for r in recs)
385
  total_intent = sum(max(0.0, r[6]) for r in recs)
386
  total_action = sum(max(0.0, r[7]) for r in recs)
387
- total_sem_meta = sum(r[8] for r in recs)
388
  total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
389
- return total_score + 0.4 * total_overlap + 0.6 * total_intent + 0.5 * total_action + 0.6 * total_sem_meta + 0.3 * total_penalty
390
 
391
  best_doc, best_doc_prior = None, -1.0
392
  for fn, recs in doc_groups.items():
@@ -397,11 +434,14 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
397
  best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
398
  other_recs = []
399
  for fn, recs in doc_groups.items():
400
- if fn == best_doc: continue
 
401
  other_recs.extend(recs)
402
  other_recs.sort(key=lambda x: x[1], reverse=True)
 
403
  reordered = best_recs + other_recs
404
  top = reordered[:top_k]
 
405
  documents = [t[3] for t in top]
406
  metadatas = [t[4] for t in top]
407
  distances = [t[2] for t in top]
@@ -420,26 +460,30 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
420
  "actions": actions,
421
  }
422
 
423
- # -------- Section helpers --------
424
  def get_section_text(filename: str, section: str) -> str:
 
425
  texts: List[str] = []
426
  for d in bm25_docs:
427
  m = d.get("meta", {})
428
  if m.get("filename") == filename and m.get("section") == section:
429
  t = (d.get("text") or "").strip()
430
- if t: texts.append(t)
 
431
  return "\n\n".join(texts).strip()
432
 
433
  def get_best_steps_section_text(filename: str) -> str:
 
434
  texts: List[str] = []
435
  for d in bm25_docs:
436
  m = d.get("meta", {})
437
  if m.get("filename") == filename and (m.get("intent_tag") == "steps"):
438
  t = (d.get("text") or "").strip()
439
- if t: texts.append(t)
 
440
  return "\n\n".join(texts).strip()
441
 
442
- # -------- Admin --------
443
  def get_kb_runtime_info() -> Dict[str, Any]:
444
  return {
445
  "chroma_path": CHROMA_PATH,
@@ -453,12 +497,15 @@ def get_kb_runtime_info() -> Dict[str, Any]:
453
  def reset_kb(folder_path: str) -> Dict[str, Any]:
454
  result = {"status": "OK", "message": "KB reset and re-ingested"}
455
  try:
456
- try: client.delete_collection(name="knowledge_base")
457
- except Exception: pass
 
 
458
  global collection
459
  collection = client.get_or_create_collection(name="knowledge_base")
460
  try:
461
- if os.path.isfile(BM25_INDEX_FILE): os.remove(BM25_INDEX_FILE)
 
462
  except Exception as e:
463
  result.setdefault("warnings", []).append(f"bm25 index delete: {e}")
464
  os.makedirs(CHROMA_PATH, exist_ok=True)
@@ -467,3 +514,4 @@ def reset_kb(folder_path: str) -> Dict[str, Any]:
467
  return result
468
  except Exception as e:
469
  return {"status": "ERROR", "error": f"{e}", "info": get_kb_runtime_info()}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import re
3
  import pickle
 
4
  from typing import List, Dict, Any, Tuple, Optional
5
  from docx import Document
6
  from sentence_transformers import SentenceTransformer
7
  import chromadb
8
 
9
+ # --------------------------- ChromaDB setup ---------------------------
10
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
11
  client = chromadb.PersistentClient(path=CHROMA_PATH)
12
  collection = client.get_or_create_collection(name="knowledge_base")
13
 
14
+ # --------------------------- Embedding model ---------------------------
15
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
16
 
17
+ # --------------------------- BM25 (lightweight) ---------------------------
18
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
19
  bm25_docs: List[Dict[str, Any]] = []
20
  bm25_inverted: Dict[str, List[int]] = {}
 
24
  BM25_K1 = 1.5
25
  BM25_B = 0.75
26
 
27
+ # --------------------------- Utilities ---------------------------
28
  def _tokenize(text: str) -> List[str]:
29
+ if not text:
30
+ return []
31
  text = text.lower()
32
  return re.findall(r"[a-z0-9]+", text)
33
 
34
  def _normalize_query(q: str) -> str:
35
  q = (q or "").strip().lower()
36
  q = re.sub(r"[^\w\s]", " ", q)
37
+ q = re.sub(
38
+ r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b",
39
+ " ",
40
+ q,
41
+ )
42
  q = re.sub(r"\s+", " ", q).strip()
43
  return q
44
 
45
def _tokenize_meta_value(val: Optional[str]) -> List[str]:
    """Tokenize a single metadata field value; None behaves as empty string."""
    return _tokenize("" if val is None else val)
47
 
48
+ # --------------------------- DOCX parsing & chunking ---------------------------
49
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
50
  sections: List[Tuple[str, List[str]]] = []
51
+ current_title = None
52
+ current_paras: List[str] = []
53
  for para in doc.paragraphs:
54
  text = (para.text or "").strip()
55
  style_name = (para.style.name if para.style else "") or ""
 
57
  if is_heading and text:
58
  if current_title or current_paras:
59
  sections.append((current_title or "Untitled Section", current_paras))
60
+ current_title = text
61
+ current_paras = []
62
  else:
63
  if text:
64
  current_paras.append(text)
 
71
 
72
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
73
  body = "\n".join(paragraphs).strip()
74
+ if not body:
75
+ return []
76
  words = body.split()
77
  chunks: List[str] = []
78
  for i in range(0, len(words), max_words):
79
  chunk_body = ' '.join(words[i:i + max_words]).strip()
80
  if chunk_body:
81
+ chunks.append(chunk_body) # no doc/section headers inside text
82
+ if not chunks:
83
+ chunks = [body]
84
  return chunks
85
 
86
+ # --------------------------- Intent tagging (auto) ---------------------------
87
  def _infer_intent_tag(section_title: str) -> str:
88
  st = (section_title or "").lower()
89
+ if any(k in st for k in ["process steps", "procedure", "how to", "workflow", "instructions"]):
90
+ return "steps"
91
+ if any(k in st for k in ["common errors", "resolution", "troubleshooting"]):
92
+ return "errors"
93
+ if any(k in st for k in ["pre-requisites", "prerequisites"]):
94
+ return "prereqs"
95
+ if any(k in st for k in ["purpose", "overview", "introduction"]):
96
+ return "purpose"
97
  return "neutral"
98
 
99
+ # --------------------------- Ingestion ---------------------------
100
  def ingest_documents(folder_path: str) -> None:
101
+ print(f"📂 Checking folder: {folder_path}")
102
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
103
+ print(f"Found {len(files)} Word files: {files}")
104
+ if not files:
105
+ print("⚠️ No .docx files found. Please check the folder path.")
106
+ return
107
 
108
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
109
  bm25_docs, bm25_inverted, bm25_df = [], {}, {}
 
114
  doc_title = os.path.splitext(file)[0]
115
  doc = Document(file_path)
116
  sections = _split_by_sections(doc)
117
+ total_chunks = 0
118
+
119
  for s_idx, (section_title, paras) in enumerate(sections):
120
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
121
+ total_chunks += len(chunks)
122
  intent_tag = _infer_intent_tag(section_title)
123
  for c_idx, chunk in enumerate(chunks):
124
  embedding = model.encode(chunk).tolist()
 
129
  "chunk_index": c_idx,
130
  "title": doc_title,
131
  "collection": "SOP",
132
+ "intent_tag": intent_tag, # NEW
133
  }
134
  try:
135
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
 
137
  try:
138
  collection.delete(ids=[doc_id])
139
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
140
+ except Exception as e2:
141
+ print(f"❌ Upsert failed for {doc_id}: {e2}")
142
 
143
  tokens = _tokenize(chunk)
144
  tf: Dict[str, int] = {}
145
+ for t in tokens:
146
+ tf[t] = tf.get(t, 0) + 1
147
  idx = len(bm25_docs)
148
  bm25_docs.append({"id": doc_id, "text": chunk, "tokens": tokens, "tf": tf, "length": len(tokens), "meta": meta})
149
+
150
  seen = set()
151
  for term in tf.keys():
152
  bm25_inverted.setdefault(term, []).append(idx)
 
154
  bm25_df[term] = bm25_df.get(term, 0) + 1
155
  seen.add(term)
156
 
157
+ print(f"📄 Ingested {file} → {total_chunks} chunks")
158
+
159
  N = len(bm25_docs)
160
  if N > 0:
161
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
162
  bm25_ready = True
163
 
164
+ payload = {
165
+ "bm25_docs": bm25_docs,
166
+ "bm25_inverted": bm25_inverted,
167
+ "bm25_df": bm25_df,
168
+ "bm25_avgdl": bm25_avgdl,
169
+ "BM25_K1": BM25_K1,
170
+ "BM25_B": BM25_B,
171
+ }
172
  os.makedirs(CHROMA_PATH, exist_ok=True)
173
  with open(BM25_INDEX_FILE, "wb") as f:
174
  pickle.dump(payload, f)
175
+ print(f"✅ BM25 index saved: {BM25_INDEX_FILE}")
176
+ print(f"✅ Documents ingested. Total entries in Chroma: {collection.count()}")
177
 
178
  def _load_bm25_index() -> None:
179
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
180
+ if not os.path.exists(BM25_INDEX_FILE):
181
+ return
182
  try:
183
  with open(BM25_INDEX_FILE, "rb") as f:
184
  payload = pickle.load(f)
 
187
  bm25_df = payload.get("bm25_df", {})
188
  bm25_avgdl = payload.get("bm25_avgdl", 0.0)
189
  bm25_ready = len(bm25_docs) > 0
190
+ if bm25_ready:
191
+ print(f"✅ BM25 index loaded: {BM25_INDEX_FILE} (docs={len(bm25_docs)})")
192
+ except Exception as e:
193
+ print(f"⚠️ Could not load BM25 index: {e}")
194
 
195
  _load_bm25_index()
196
 
197
+ # --------------------------- BM25 search ---------------------------
198
def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
    """Okapi BM25 score of indexed document *doc_idx* for *query_terms*.

    Returns 0.0 when the BM25 index is not ready or *doc_idx* is out of
    range.  Uses idf = log(1 + (N - df + 0.5) / (df + 0.5)), which is
    non-negative for every df in [1, N].
    """
    # Hoisted out of the per-term loop: the original re-ran `import math`
    # (and a try/except around it) on every query term.
    import math

    if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
        return 0.0
    doc = bm25_docs[doc_idx]
    dl = doc["length"] or 1  # guard against zero-length docs
    N = len(bm25_docs)       # loop-invariant corpus size
    score = 0.0
    for term in query_terms:
        df = bm25_df.get(term, 0)
        if df == 0:
            continue  # term not in corpus
        tf = doc["tf"].get(term, 0)
        if tf == 0:
            continue  # term not in this document
        # df <= N, so the log argument is strictly > 1 and cannot raise;
        # the previous per-term try/except fallback was dead code.
        idf = math.log(((N - df + 0.5) / (df + 0.5)) + 1.0)
        denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
        score += idf * ((tf * (BM25_K1 + 1)) / (denom or 1.0))
    return score
221
 
222
def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
    """Return up to *top_k* ``(doc_index, bm25_score)`` pairs, best first.

    Empty when the index is not loaded or the normalized query has no
    tokens; only positive-scoring documents are returned.
    """
    if not bm25_ready:
        return []
    q_terms = _tokenize(_normalize_query(query))
    if not q_terms:
        return []
    # Candidate pool: every document containing at least one query term.
    candidates: set = set()
    for term in q_terms:
        candidates.update(bm25_inverted.get(term, []))
    if not candidates:
        # No posting list matched — fall back to scoring the whole corpus.
        candidates = set(range(len(bm25_docs)))
    scored = []
    for doc_idx in candidates:
        s = _bm25_score_for_doc(q_terms, doc_idx)
        if s > 0:
            scored.append((doc_idx, s))
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]
242
 
243
+ # --------------------------- Semantic-only ---------------------------
 
 
 
 
244
def search_knowledge_base(query: str, top_k: int = 10) -> dict:
    """Semantic-only search against the Chroma collection.

    Returns a dict of parallel lists: documents, metadatas, distances, ids.
    When Chroma returns no ids, they are synthesized from metadata as
    ``filename:section:chunk_index``.
    """
    query_vec = model.encode(query).tolist()
    res = collection.query(
        query_embeddings=[query_vec],
        n_results=top_k,
        include=['documents', 'metadatas', 'distances']
    )

    def _first_row(key: str) -> list:
        # Chroma returns list-of-lists (one row per query embedding).
        rows = res.get(key, [[]]) or [[]]
        return rows[0] if rows else []

    documents = _first_row("documents")
    metadatas = _first_row("metadatas")
    distances = _first_row("distances")
    ids = _first_row("ids")

    if not ids and documents:
        # Synthesize stable ids from metadata when Chroma omits them.
        ids = [
            f"{(m or {}).get('filename', 'unknown')}:"
            f"{(m or {}).get('section', 'section')}:"
            f"{(m or {}).get('chunk_index', i)}"
            for i, m in enumerate(metadatas)
        ]

    print(f"🔎 KB search {len(documents)} docs (top_k={top_k}); "
          f"first distance: {distances[0] if distances else 'n/a'}; ids={len(ids)}")
    return {
        "documents": documents,
        "metadatas": metadatas,
        "distances": distances,
        "ids": ids,
    }
 
 
 
 
 
 
 
 
 
 
278
 
279
+ # --------------------------- Hybrid (BM25 + Embeddings + Intent + Action) ---------------------------
280
# Action verbs and their surface synonyms; used both to detect the action a
# query asks about and to weight chunks that mention it.
ACTION_SYNONYMS = {
    "create": ["create", "creation", "add", "new", "generate"],
    "update": ["update", "modify", "change", "edit"],
    "delete": ["delete", "remove"],
    "navigate": ["navigate", "go to", "open"],
    # NOTE: 'perform' deliberately excluded to avoid wrong boosts like
    # Appointment "performed...".
}
287
 
288
+ def _detect_user_intent(query: str) -> str:
289
+ q = (query or "").lower()
290
+ if any(k in q for k in ["steps", "procedure", "how to", "navigate", "perform", "do", "process"]):
291
+ return "steps"
292
+ if any(k in q for k in ["error", "issue", "fail", "not working", "resolution", "fix"]):
293
+ return "errors"
294
+ if any(k in q for k in ["pre-requisite", "prerequisites", "requirement", "requirements"]):
295
+ return "prereqs"
296
+ if any(k in q for k in ["purpose", "overview", "introduction"]):
297
+ return "purpose"
298
+ return "neutral"
299
+
300
def _extract_actions(query: str) -> List[str]:
    """Return every action whose synonym appears as a substring of the query.

    Preserves ACTION_SYNONYMS insertion order; returns an empty list when
    nothing matches.
    """
    lowered = (query or "").lower()
    return [
        action
        for action, synonyms in ACTION_SYNONYMS.items()
        if any(syn in lowered for syn in synonyms)
    ]
307
 
308
  def _intent_weight(meta: dict, user_intent: str) -> float:
309
  tag = (meta or {}).get("intent_tag", "neutral")
310
+ if user_intent == "neutral":
311
+ return 0.0
312
+ if tag == user_intent:
313
+ return 1.0
314
+ if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]:
315
+ return -0.6
316
  return -0.2
317
 
318
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
 
320
  title_tokens = _tokenize_meta_value(meta.get("title"))
321
  section_tokens = _tokenize_meta_value(meta.get("section"))
322
  meta_tokens = set(fn_tokens + title_tokens + section_tokens)
323
+ if not meta_tokens or not q_terms:
324
+ return 0.0
325
  qset = set(q_terms)
326
  inter = len(meta_tokens & qset)
327
  return inter / max(1, len(qset))
328
 
 
 
 
 
 
 
329
def _action_weight(text: str, actions: List[str]) -> float:
    """Reward chunks mentioning the query's action synonyms; penalize conflicts.

    +1.0 per matching synonym of a requested action; -0.8 per synonym of a
    conflicting action (e.g. the query asks "create" but the chunk talks
    about "delete").  Returns 0.0 when no actions were detected.
    """
    if not actions:
        return 0.0
    lowered = (text or "").lower()
    score = 0.0
    for action in actions:
        for syn in ACTION_SYNONYMS.get(action, [action]):
            if syn in lowered:
                score += 1.0
    # Opposing actions appearing in the same chunk count against it.
    conflicts = {"create": ["delete"], "delete": ["create"], "update": ["delete"], "navigate": []}
    for action in actions:
        for opposing in conflicts.get(action, []):
            for syn in ACTION_SYNONYMS.get(opposing, [opposing]):
                if syn in lowered:
                    score -= 0.8
    return score
345
 
def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
    """Hybrid retrieval: blend semantic, lexical and metadata signals.

    Per-chunk score:
        final = alpha*semantic_sim + beta*bm25_norm + gamma*meta_overlap
                + delta*intent_boost + epsilon*action_weight

    Chunks are then grouped by source document; every chunk of the document
    with the highest aggregate prior is ranked ahead of all other chunks.

    NOTE(review): several runs of this function's lines were lost in a garbled
    diff (semantic metadata/distance extraction, BM25 id derivation, the
    per-candidate scoring inputs, the best-document selection loop, and the
    tail of the return dict). Those parts are reconstructed from the
    surrounding code — confirm exact output keys against repository history.

    Args:
        query: Raw user query.
        top_k: Number of chunks to return.
        alpha/beta: Weights for the semantic and BM25 legs.

    Returns:
        dict with documents, metadatas, distances, ids, scores,
        best_document, intent and actions.
    """
    norm_query = _normalize_query(query)
    q_terms = _tokenize(norm_query)
    user_intent = _detect_user_intent(query)
    actions = _extract_actions(query)

    # --- semantic leg (Chroma) ---
    sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
    sem_docs = sem_res.get("documents", [])
    sem_metas = sem_res.get("metadatas", [])   # reconstructed line
    sem_dists = sem_res.get("distances", [])   # reconstructed line
    sem_ids = sem_res.get("ids", [])

    def dist_to_sim(d: Optional[float]) -> float:
        # Map a non-negative distance to a similarity in (0, 1].
        if d is None:
            return 0.0
        try:
            return 1.0 / (1.0 + float(d))
        except Exception:
            return 0.0

    sem_sims = [dist_to_sim(d) for d in sem_dists]
    # Position map: O(1) lookups instead of repeated list.index() scans.
    sem_pos = {cid: i for i, cid in enumerate(sem_ids)}

    # --- lexical leg (BM25), scores normalized to [0, 1] ---
    bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
    bm25_max = max([s for _, s in bm25_hits], default=1.0)
    bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]

    bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
    for idx, nscore in bm25_norm_pairs:
        d = bm25_docs[idx]
        # NOTE(review): id derivation reconstructed — assumes each BM25 doc
        # record carries the same synthesized id used on the Chroma side;
        # falls back to a metadata-based id. Confirm against ingestion code.
        m = d.get("meta", {}) or {}
        cid = d.get("id") or f"{m.get('filename', 'unknown')}::{m.get('section', '')}::{idx}"
        bm25_id_to_norm[cid] = max(nscore, bm25_id_to_norm.get(cid, 0.0))
        bm25_id_to_text[cid] = d.get("text", "")
        bm25_id_to_meta[cid] = m

    union_ids = set(sem_pos) | set(bm25_id_to_norm)

    gamma = 0.25    # meta overlap weight
    delta = 0.35    # intent boost weight
    epsilon = 0.30  # action weight

    combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]] = []
    for cid in union_ids:
        # NOTE(review): per-candidate extraction reconstructed from the
        # variables consumed below (sem_sim, sem_dist, text, meta, bm25_sim).
        if cid in sem_pos:
            pos = sem_pos[cid]
            sem_sim = sem_sims[pos] if pos < len(sem_sims) else 0.0
            sem_dist = sem_dists[pos] if pos < len(sem_dists) else None
            text = sem_docs[pos] if pos < len(sem_docs) else ""
            sem_meta = sem_metas[pos] if pos < len(sem_metas) else {}
        else:
            sem_sim, sem_dist, text, sem_meta = 0.0, None, "", {}

        bm25_sim = bm25_id_to_norm.get(cid, 0.0)
        if not text:
            text = bm25_id_to_text.get(cid, "")
        bm25_meta = bm25_id_to_meta.get(cid, {})
        meta = sem_meta if sem_meta else bm25_meta

        m_overlap = _meta_overlap(meta, q_terms)
        intent_boost = _intent_weight(meta, user_intent)
        act_wt = _action_weight(text, actions)

        final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost + epsilon * act_wt
        combined_records_ext.append(
            (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt)
        )

    # --- group by source document; rank the best document's chunks first ---
    from collections import defaultdict
    doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]]] = defaultdict(list)
    for rec in combined_records_ext:
        meta = rec[4] or {}
        doc_groups[meta.get("filename", "unknown")].append(rec)

    def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]]) -> float:
        # Document-level prior: aggregate chunk score plus positive boosts,
        # with a discounted penalty for negative intent/action signals.
        total_score = sum(r[1] for r in recs)
        total_overlap = sum(r[5] for r in recs)
        total_intent = sum(max(0.0, r[6]) for r in recs)
        total_action = sum(max(0.0, r[7]) for r in recs)
        total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
        return total_score + 0.4 * total_overlap + 0.6 * total_intent + 0.5 * total_action + 0.3 * total_penalty

    best_doc, best_doc_prior = None, -1.0
    for fn, recs in doc_groups.items():
        # NOTE(review): selection loop body reconstructed.
        prior = doc_prior(recs)
        if prior > best_doc_prior:
            best_doc, best_doc_prior = fn, prior

    best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
    other_recs = []
    for fn, recs in doc_groups.items():
        if fn == best_doc:
            continue
        other_recs.extend(recs)
    other_recs.sort(key=lambda x: x[1], reverse=True)

    reordered = best_recs + other_recs
    top = reordered[:top_k]

    documents = [t[3] for t in top]
    metadatas = [t[4] for t in top]
    distances = [t[2] for t in top]

    # NOTE(review): tail of the return dict reconstructed from the visible
    # trailing keys ("actions") and the values computed above.
    return {
        "documents": documents,
        "metadatas": metadatas,
        "distances": distances,
        "ids": [t[0] for t in top],
        "scores": [t[1] for t in top],
        "best_document": best_doc,
        "intent": user_intent,
        "actions": actions,
    }
463
+ # --------------------------- Section fetch helpers (for full output) ---------------------------
def get_section_text(filename: str, section: str) -> str:
    """Join the text of every indexed chunk belonging to *filename*/*section*.

    Scans the in-memory BM25 document list; chunks whose text is empty after
    stripping are skipped. Returns the matching chunks separated by blank
    lines, or an empty string when nothing matches.
    """
    def _matches(doc: Dict[str, Any]) -> bool:
        meta = doc.get("meta", {})
        return meta.get("filename") == filename and meta.get("section") == section

    parts = [(doc.get("text") or "").strip() for doc in bm25_docs if _matches(doc)]
    return "\n\n".join(p for p in parts if p).strip()
def get_best_steps_section_text(filename: str) -> str:
    """Return combined text of all 'steps'-tagged chunks of the given SOP.

    Same traversal as get_section_text, but filters on the ``intent_tag``
    metadata field instead of a section name.
    """
    collected: List[str] = []
    for doc in bm25_docs:
        meta = doc.get("meta", {})
        if meta.get("filename") != filename:
            continue
        if meta.get("intent_tag") != "steps":
            continue
        chunk = (doc.get("text") or "").strip()
        if chunk:
            collected.append(chunk)
    return "\n\n".join(collected).strip()
486
+ # --- Admin helpers (optional; unchanged) ---
487
  def get_kb_runtime_info() -> Dict[str, Any]:
488
  return {
489
  "chroma_path": CHROMA_PATH,
 
def reset_kb(folder_path: str) -> Dict[str, Any]:
    """Drop and recreate the knowledge base, then re-ingest *folder_path*.

    Deletes the Chroma collection (best-effort), rebinds the module-level
    ``collection``, removes the persisted BM25 index file, and re-ingests the
    documents from ``folder_path``. Non-fatal problems are collected under a
    ``warnings`` key; any unexpected failure yields a ``status: ERROR`` dict.
    """
    result = {"status": "OK", "message": "KB reset and re-ingested"}
    try:
        # Best-effort drop: the collection may not exist yet.
        try:
            client.delete_collection(name="knowledge_base")
        except Exception:
            pass
        global collection
        collection = client.get_or_create_collection(name="knowledge_base")
        # Remove the stale BM25 index; record (don't fail on) any problem.
        try:
            if os.path.isfile(BM25_INDEX_FILE):
                os.remove(BM25_INDEX_FILE)
        except Exception as e:
            result.setdefault("warnings", []).append(f"bm25 index delete: {e}")
        os.makedirs(CHROMA_PATH, exist_ok=True)
        # NOTE(review): the original re-ingestion lines were lost in a garbled
        # diff; reconstructed as a call to the module's ingestion entry point
        # plus a runtime-info snapshot — confirm the helper name against
        # repository history.
        create_knowledge_base(folder_path)
        result["info"] = get_kb_runtime_info()
        return result
    except Exception as e:
        return {"status": "ERROR", "error": f"{e}", "info": get_kb_runtime_info()}