Jaita committed on
Commit
37ff4d9
·
verified ·
1 Parent(s): 8fe898e

Create services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +63 -0
services/kb_creation.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from docx import Document
3
+ from sentence_transformers import SentenceTransformer
4
+ import chromadb
5
+
6
# Initialize a persistent ChromaDB client; data is stored on disk under
# ./chroma_db so the knowledge base survives process restarts.
client = chromadb.PersistentClient(path="chroma_db")
# Idempotent: reuses the "knowledge_base" collection if it already exists.
collection = client.get_or_create_collection(name="knowledge_base")

# Load embedding model (384-dim MiniLM sentence encoder; downloaded on first
# use — NOTE(review): this runs at import time, so importing this module has
# network/disk side effects).
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
12
+
13
+
14
def extract_text_from_docx(file_path):
    """Return the full text of a .docx file, paragraphs joined by newlines."""
    document = Document(file_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
19
+
20
+
21
+
22
def chunk_text(text, max_words=300):
    """Split text on whitespace into chunks of at most max_words words each.

    Returns a list of strings; an empty/whitespace-only text yields [].
    """
    words = text.split()
    chunks = []
    for start in range(0, len(words), max_words):
        chunks.append(' '.join(words[start:start + max_words]))
    return chunks
26
+
27
+
28
+
29
def ingest_documents(folder_path):
    """Read .docx files from folder_path, chunk their text, generate
    embeddings, and store everything in the ChromaDB collection.

    Chunk ids are "<filename>_<index>" and each chunk carries a
    {"filename": ...} metadata entry, so re-ingesting the same folder
    overwrites rather than duplicates entries.
    """
    # Skip Word lock/temp files ("~$...") — python-docx cannot open them.
    files = [f for f in os.listdir(folder_path)
             if f.endswith('.docx') and not f.startswith('~$')]

    if not files:
        print("⚠️ No .docx files found. Please check the folder path.")
        return

    for file in files:
        file_path = os.path.join(folder_path, file)
        text = extract_text_from_docx(file_path)
        chunks = chunk_text(text)

        # Empty document → nothing to embed (chromadb rejects empty batches).
        if not chunks:
            continue

        # Batch-encode all chunks in one call and add them in one batch:
        # far fewer model/DB round-trips than one encode+add per chunk.
        embeddings = model.encode(chunks).tolist()
        collection.add(
            ids=[f"{file}_{i}" for i in range(len(chunks))],
            embeddings=embeddings,
            documents=chunks,
            metadatas=[{"filename": file} for _ in chunks],
        )

    print(f"✅ Documents ingested. Total entries: {collection.count()}")
57
+
58
def search_knowledge_base(query, top_k=3):
    """Return the top_k semantically closest knowledge-base entries to query.

    The result is chromadb's query dict containing embeddings, documents,
    metadatas, and distances for the matches.
    """
    embedded_query = model.encode(query).tolist()
    results = collection.query(
        query_embeddings=[embedded_query],
        n_results=top_k,
        include=['embeddings', 'documents', 'metadatas', 'distances'],
    )
    return results