srilakshu012456 commited on
Commit
6f1a758
Β·
verified Β·
1 Parent(s): 97a9e9a

Create kb_creation.py

Browse files
Files changed (1) hide show
  1. kb_creation.py +433 -0
kb_creation.py ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import re
4
+ import pickle
5
+ from typing import List, Dict, Any, Tuple, Optional
6
+ from docx import Document
7
+ from sentence_transformers import SentenceTransformer
8
+ import chromadb
9
+
10
+ # ------------------------- ChromaDB setup -------------------------
11
+ CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
12
+ client = chromadb.PersistentClient(path=CHROMA_PATH)
13
+ collection = client.get_or_create_collection(name="knowledge_base")
14
+
15
+ # ------------------------- Embedding model ------------------------
16
+ # You can swap to a multilingual model if you expect mixed language queries:
17
+ # model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
18
+ MODEL_PATH = './models/all-MiniLM-L6-v2'
19
+ model = SentenceTransformer(MODEL_PATH)
20
+
21
+ # ------------------------- BM25 (lightweight) ---------------------
22
+ BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
23
+
24
+ bm25_docs: List[Dict[str, Any]] = [] # each: {id, text, tokens, tf, length, meta}
25
+ bm25_inverted: Dict[str, List[int]] = {} # term -> list of doc indices in bm25_docs
26
+ bm25_df: Dict[str, int] = {} # term -> document frequency
27
+ bm25_avgdl: float = 0.0
28
+ bm25_ready: bool = False
29
+ BM25_K1 = 1.5
30
+ BM25_B = 0.75
31
+
32
+ # ------------------------- Utilities ------------------------------
33
+ def _tokenize(text: str) -> List[str]:
34
+ """
35
+ Simple tokenizer: lowercase alphanumeric words; removes most punctuation.
36
+ Keeps stopwords (BM25 can work with them), but normalizes whitespace.
37
+ """
38
+ if not text:
39
+ return []
40
+ text = text.lower()
41
+ tokens = re.findall(r"[a-z0-9]+", text)
42
+ return tokens
43
+
44
+ def _normalize_query(q: str) -> str:
45
+ """
46
+ Language-agnostic normalization for user queries (no hardcoded domain synonyms).
47
+ Removes filler verbs, collapses whitespace, lowercases, keeps key terms.
48
+ """
49
+ q = (q or "").strip().lower()
50
+ q = re.sub(r"[^\w\s]", " ", q)
51
+ # remove generic filler verbs/common noise words across English variants
52
+ q = re.sub(r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b", " ", q)
53
+ q = re.sub(r"\s+", " ", q).strip()
54
+ return q
55
+
56
+ # ------------------------- DOCX parsing & chunking ----------------
57
+ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
58
+ """
59
+ Split DOCX into (section_title, paragraphs_in_section).
60
+ Uses paragraph style names: 'Heading 1', 'Heading 2', etc.
61
+ Falls back to document-level when no headings are present.
62
+ """
63
+ sections: List[Tuple[str, List[str]]] = []
64
+ current_title = None
65
+ current_paras: List[str] = []
66
+
67
+ for para in doc.paragraphs:
68
+ text = (para.text or "").strip()
69
+ style_name = (para.style.name if para.style else "") or ""
70
+ is_heading = bool(re.match(r"Heading\s*\d+", style_name, flags=re.IGNORECASE))
71
+
72
+ if is_heading and text:
73
+ # commit previous section
74
+ if current_title or current_paras:
75
+ sections.append((current_title or "Untitled Section", current_paras))
76
+ current_title = text
77
+ current_paras = []
78
+ else:
79
+ if text:
80
+ current_paras.append(text)
81
+
82
+ # final section
83
+ if current_title or current_paras:
84
+ sections.append((current_title or "Untitled Section", current_paras))
85
+
86
+ # in case no headings at all, make one pseudo-section with all text
87
+ if not sections:
88
+ all_text = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
89
+ sections = [("Document", all_text)]
90
+
91
+ return sections
92
+
93
+ def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
94
+ """
95
+ Build chunks that keep: Document Title + Section Title + paragraphs,
96
+ so short bullets like 'Putaway error: ...' stay with their header.
97
+ """
98
+ # Join paras for chunking
99
+ body = "\n".join(paragraphs)
100
+ words = body.split()
101
+ chunks: List[str] = []
102
+ for i in range(0, len(words), max_words):
103
+ chunk_body = ' '.join(words[i:i + max_words])
104
+ chunk_text = f"{doc_title}\n{section_title}\n\n{chunk_body}".strip()
105
+ chunks.append(chunk_text)
106
+ if not chunks and body:
107
+ chunks = [f"{doc_title}\n{section_title}\n\n{body}"]
108
+ return chunks
109
+
110
+ # ------------------------- Ingestion ------------------------------
111
+ def ingest_documents(folder_path: str) -> None:
112
+ """
113
+ Read .docx files, section-aware chunking, generate embeddings, store in ChromaDB,
114
+ and build BM25 inverted index with persistence.
115
+ """
116
+ print(f"πŸ“‚ Checking folder: {folder_path}")
117
+ files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
118
+ print(f"Found {len(files)} Word files: {files}")
119
+ if not files:
120
+ print("⚠️ No .docx files found. Please check the folder path.")
121
+ return
122
+
123
+ # Reset BM25 memory structures
124
+ global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
125
+ bm25_docs = []
126
+ bm25_inverted = {}
127
+ bm25_df = {}
128
+ bm25_avgdl = 0.0
129
+ bm25_ready = False
130
+
131
+ for file in files:
132
+ file_path = os.path.join(folder_path, file)
133
+ doc_title = os.path.splitext(file)[0]
134
+ doc = Document(file_path)
135
+ sections = _split_by_sections(doc)
136
+ total_chunks = 0
137
+
138
+ for s_idx, (section_title, paras) in enumerate(sections):
139
+ chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
140
+ total_chunks += len(chunks)
141
+
142
+ for c_idx, chunk in enumerate(chunks):
143
+ # Embedding & Chroma
144
+ embedding = model.encode(chunk).tolist()
145
+ doc_id = f"{file}:{s_idx}:{c_idx}" # stable unique id
146
+ meta = {"filename": file, "section": section_title, "chunk_index": c_idx, "title": doc_title, "collection": "SOP"}
147
+
148
+ try:
149
+ collection.add(
150
+ ids=[doc_id],
151
+ embeddings=[embedding],
152
+ documents=[chunk],
153
+ metadatas=[meta],
154
+ )
155
+ except Exception as e:
156
+ # upsert on duplicate
157
+ try:
158
+ collection.delete(ids=[doc_id])
159
+ collection.add(
160
+ ids=[doc_id],
161
+ embeddings=[embedding],
162
+ documents=[chunk],
163
+ metadatas=[meta],
164
+ )
165
+ except Exception as e2:
166
+ print(f"❌ Upsert failed for {doc_id}: {e2}")
167
+
168
+ # BM25 indexing
169
+ tokens = _tokenize(chunk)
170
+ tf: Dict[str, int] = {}
171
+ for t in tokens:
172
+ tf[t] = tf.get(t, 0) + 1
173
+ idx = len(bm25_docs)
174
+ bm25_docs.append({"id": doc_id, "text": chunk, "tokens": tokens, "tf": tf, "length": len(tokens), "meta": meta})
175
+ # update inverted index & df
176
+ seen_terms = set()
177
+ for term in tf.keys():
178
+ bm25_inverted.setdefault(term, []).append(idx)
179
+ if term not in seen_terms:
180
+ bm25_df[term] = bm25_df.get(term, 0) + 1
181
+ seen_terms.add(term)
182
+
183
+ print(f"πŸ“„ Ingested {file} β†’ {total_chunks} chunks")
184
+
185
+ # finalize BM25 stats
186
+ N = len(bm25_docs)
187
+ if N > 0:
188
+ bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
189
+ bm25_ready = True
190
+
191
+ # persist BM25 index
192
+ payload = {
193
+ "bm25_docs": bm25_docs,
194
+ "bm25_inverted": bm25_inverted,
195
+ "bm25_df": bm25_df,
196
+ "bm25_avgdl": bm25_avgdl,
197
+ "BM25_K1": BM25_K1,
198
+ "BM25_B": BM25_B,
199
+ }
200
+ os.makedirs(CHROMA_PATH, exist_ok=True)
201
+ with open(BM25_INDEX_FILE, "wb") as f:
202
+ pickle.dump(payload, f)
203
+ print(f"βœ… BM25 index saved: {BM25_INDEX_FILE}")
204
+
205
+ print(f"βœ… Documents ingested. Total entries in Chroma: {collection.count()}")
206
+
207
+ def _load_bm25_index() -> None:
208
+ """
209
+ Load persisted BM25 index if available.
210
+ """
211
+ global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
212
+ if not os.path.exists(BM25_INDEX_FILE):
213
+ return
214
+ try:
215
+ with open(BM25_INDEX_FILE, "rb") as f:
216
+ payload = pickle.load(f)
217
+ bm25_docs = payload.get("bm25_docs", [])
218
+ bm25_inverted = payload.get("bm25_inverted", {})
219
+ bm25_df = payload.get("bm25_df", {})
220
+ bm25_avgdl = payload.get("bm25_avgdl", 0.0)
221
+ # params retained but we keep module-level constants
222
+ bm25_ready = len(bm25_docs) > 0
223
+ if bm25_ready:
224
+ print(f"βœ… BM25 index loaded: {BM25_INDEX_FILE} (docs={len(bm25_docs)})")
225
+ except Exception as e:
226
+ print(f"⚠️ Could not load BM25 index: {e}")
227
+
228
+ # auto-load on import
229
+ _load_bm25_index()
230
+
231
+ # ------------------------- BM25 search ----------------------------------------
232
+ def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
233
+ """
234
+ Okapi BM25 score for a given doc.
235
+ """
236
+ if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
237
+ return 0.0
238
+ doc = bm25_docs[doc_idx]
239
+ score = 0.0
240
+ dl = doc["length"] or 1
241
+ for term in query_terms:
242
+ df = bm25_df.get(term, 0)
243
+ if df == 0:
244
+ continue
245
+ tf = doc["tf"].get(term, 0)
246
+ if tf == 0:
247
+ continue
248
+ # BM25 idf
249
+ N = len(bm25_docs)
250
+ idf = max(0.0, ( (N - df + 0.5) / (df + 0.5) ))
251
+ idf = (idf if idf > 0 else 1.0)
252
+ idf = 1.0 * ( (N - df + 0.5) / (df + 0.5) ) # raw ratio
253
+ # typical log form
254
+ try:
255
+ import math
256
+ idf = math.log(idf + 1.0)
257
+ except Exception:
258
+ pass
259
+
260
+ denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
261
+ score += idf * ( (tf * (BM25_K1 + 1)) / (denom or 1.0) )
262
+ return score
263
+
264
+ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
265
+ """
266
+ Returns a list of (doc_idx, score) sorted by score desc.
267
+ """
268
+ if not bm25_ready:
269
+ return []
270
+ norm = _normalize_query(query)
271
+ q_terms = _tokenize(norm)
272
+ if not q_terms:
273
+ return []
274
+ # collect candidate doc indices via inverted index
275
+ candidates = set()
276
+ for t in q_terms:
277
+ for idx in bm25_inverted.get(t, []):
278
+ candidates.add(idx)
279
+ if not candidates:
280
+ # fallback to brute force if no inverted match
281
+ candidates = set(range(len(bm25_docs)))
282
+
283
+ scored = []
284
+ for idx in candidates:
285
+ s = _bm25_score_for_doc(q_terms, idx)
286
+ if s > 0:
287
+ scored.append((idx, s))
288
+ scored.sort(key=lambda x: x[1], reverse=True)
289
+ return scored[:top_k]
290
+
291
+ # ------------------------- Semantic-only (legacy) ------------------------------
292
+
293
+ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
294
+ """
295
+ Semantic-only search (Chroma). We DO NOT ask for 'ids' in include
296
+ because some Chroma clients reject it; if 'ids' is present in the
297
+ response we will use it, otherwise we synthesize stable IDs from metadata.
298
+ """
299
+ query_embedding = model.encode(query).tolist()
300
+ res = collection.query(
301
+ query_embeddings=[query_embedding],
302
+ n_results=top_k,
303
+ include=['documents', 'metadatas', 'distances'] # ← no 'ids' here
304
+ )
305
+
306
+ # Flatten lists-per-query
307
+ docs_ll = res.get("documents", [[]]) or [[]]
308
+ metas_ll = res.get("metadatas", [[]]) or [[]]
309
+ dists_ll = res.get("distances", [[]]) or [[]]
310
+ ids_ll = res.get("ids", [[]]) or [[]] # some clients still return 'ids' anyway
311
+
312
+ documents = docs_ll[0] if docs_ll else []
313
+ metadatas = metas_ll[0] if metas_ll else []
314
+ distances = dists_ll[0] if dists_ll else []
315
+ ids = ids_ll[0] if ids_ll else []
316
+
317
+ # If 'ids' is missing, synthesize stable IDs from metadata
318
+ if not ids and documents:
319
+ synthesized = []
320
+ for i, m in enumerate(metadatas):
321
+ fn = (m or {}).get("filename", "unknown")
322
+ sec = (m or {}).get("section", "section")
323
+ idx = (m or {}).get("chunk_index", i)
324
+ synthesized.append(f"{fn}:{sec}:{idx}")
325
+ ids = synthesized
326
+
327
+ print(f"πŸ”Ž KB search β†’ {len(documents)} docs (top_k={top_k}); "
328
+ f"first distance: {distances[0] if distances else 'n/a'}; ids={len(ids)}")
329
+
330
+ return {
331
+ "documents": documents,
332
+ "metadatas": metadatas,
333
+ "distances": distances,
334
+ "ids": ids,
335
+ }
336
+
337
+ # ------------------------- Hybrid (BM25 + Embeddings) -------------------------
338
+ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
339
+ """
340
+ Hybrid retrieval:
341
+ - Semantic (Chroma/embeddings) β†’ distances (lower = better) β†’ convert to similarity
342
+ - BM25 keyword β†’ score (higher = better)
343
+ - Re-rank union of candidates by: final = alpha * semantic_sim + beta * bm25_norm
344
+
345
+ Returns a dict compatible with the extractor but also includes:
346
+ - 'ids': list[str]
347
+ - 'combined_scores': list[float] (0..1)
348
+ - 'distances': list[float] from semantic (may be missing if fetched from BM25-only)
349
+ """
350
+ # 1) Normalize query (language-agnostic, no domain synonyms)
351
+ norm_query = _normalize_query(query)
352
+
353
+ # 2) Semantic candidates (Chroma)
354
+ sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
355
+ sem_docs = sem_res.get("documents", [])
356
+ sem_metas = sem_res.get("metadatas", [])
357
+ sem_dists = sem_res.get("distances", [])
358
+ sem_ids = sem_res.get("ids", [])
359
+
360
+ # Convert distances to 0..1 similarity (simple monotonic mapping)
361
+ def dist_to_sim(d: Optional[float]) -> float:
362
+ if d is None:
363
+ return 0.0
364
+ try:
365
+ return 1.0 / (1.0 + float(d)) # lower distance -> higher sim
366
+ except Exception:
367
+ return 0.0
368
+
369
+ sem_sims = [dist_to_sim(d) for d in sem_dists]
370
+
371
+ # 3) BM25 candidates
372
+ bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
373
+ # normalize BM25 scores to 0..1
374
+ bm25_max = max([s for _, s in bm25_hits], default=1.0)
375
+ bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
376
+
377
+ # 4) Merge candidates by doc_id
378
+ # For BM25 doc_idx β†’ get doc info
379
+ bm25_id_to_norm: Dict[str, float] = {}
380
+ bm25_id_to_text: Dict[str, str] = {}
381
+ bm25_id_to_meta: Dict[str, Dict[str, Any]] = {}
382
+ for idx, nscore in bm25_norm_pairs:
383
+ d = bm25_docs[idx]
384
+ bm25_id_to_norm[d["id"]] = nscore
385
+ bm25_id_to_text[d["id"]] = d["text"]
386
+ bm25_id_to_meta[d["id"]] = d["meta"]
387
+
388
+ # Build union
389
+ union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
390
+
391
+ # 5) For each candidate id, compute combined score and collect fields
392
+ combined_records: List[Tuple[str, float, float, str, Dict[str, Any]]] = []
393
+ for cid in union_ids:
394
+ # semantic part
395
+ if cid in sem_ids:
396
+ pos = sem_ids.index(cid)
397
+ sem_sim = sem_sims[pos] if pos < len(sem_sims) else 0.0
398
+ sem_dist = sem_dists[pos] if pos < len(sem_dists) else None
399
+ sem_text = sem_docs[pos] if pos < len(sem_docs) else ""
400
+ sem_meta = sem_metas[pos] if pos < len(sem_metas) else {}
401
+ else:
402
+ sem_sim, sem_dist, sem_text, sem_meta = 0.0, None, "", {}
403
+
404
+ # bm25 part
405
+ bm25_sim = bm25_id_to_norm.get(cid, 0.0)
406
+ bm25_text = bm25_id_to_text.get(cid, "")
407
+ bm25_meta = bm25_id_to_meta.get(cid, {})
408
+
409
+ # prefer non-empty text/meta
410
+ text = sem_text if sem_text else bm25_text
411
+ meta = sem_meta if sem_meta else bm25_meta
412
+
413
+ # final combined score
414
+ final_score = alpha * sem_sim + beta * bm25_sim
415
+ combined_records.append((cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta))
416
+
417
+ # 6) Sort by combined score desc and take top_k
418
+ combined_records.sort(key=lambda x: x[1], reverse=True)
419
+ top = combined_records[:top_k]
420
+
421
+ documents = [t[3] for t in top]
422
+ metadatas = [t[4] for t in top]
423
+ distances = [t[2] for t in top] # keep semantic distance (999 if BM25-only)
424
+ ids = [t[0] for t in top]
425
+ combined_scores = [t[1] for t in top]
426
+
427
+ return {
428
+ "documents": documents,
429
+ "metadatas": metadatas,
430
+ "distances": distances,
431
+ "ids": ids,
432
+ "combined_scores": combined_scores,
433
+ }