# chatbot-backend/services/kb_creation.py
# Author avatar: Jaita
# Commit message: Create services/kb_creation.py
# Commit: 37ff4d9 (verified)
import os
from docx import Document
from sentence_transformers import SentenceTransformer
import chromadb
# Settings for the module-wide vector store and embedding model.
_CHROMA_PATH = "chroma_db"
_COLLECTION_NAME = "knowledge_base"
_EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Persistent ChromaDB client and the collection every function below
# reads from and writes to.
client = chromadb.PersistentClient(path=_CHROMA_PATH)
collection = client.get_or_create_collection(name=_COLLECTION_NAME)

# Embedding model shared by ingestion and search.
model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
def extract_text_from_docx(file_path):
    """Return the text of a .docx file as a single newline-joined string.

    The file is opened with ``Document`` and the ``text`` of every entry in
    its ``paragraphs`` attribute is joined with ``'\\n'``. Content that the
    document does not expose through ``paragraphs`` is not included.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The paragraph texts joined by newlines.
    """
    paragraphs = Document(file_path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)
def chunk_text(text, max_words=300):
    """Split text into chunks of at most ``max_words`` words.

    Words are the whitespace-separated tokens produced by ``str.split()``,
    so runs of spaces and newlines in ``text`` are collapsed to single
    spaces in the returned chunks.

    Args:
        text: The text to split.
        max_words: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in original order. Empty or
        whitespace-only ``text`` yields an empty list.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    # A negative step would make range() empty and silently drop every
    # word; reject it explicitly, alongside zero.
    if max_words < 1:
        raise ValueError(
            f"max_words must be a positive integer, got {max_words!r}"
        )
    words = text.split()
    return [
        ' '.join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]
def ingest_documents(folder_path):
    """Read .docx files, chunk text, generate embeddings, and store in ChromaDB.

    Every ``.docx`` file directly inside ``folder_path`` is converted to
    text, split with ``chunk_text``, embedded with the module-level
    ``model`` and written to the module-level ``collection``. Each chunk is
    stored under the id ``"<filename>_<chunk index>"`` with
    ``{"filename": <filename>}`` as metadata.

    Chunks are written with ``upsert``, so running ingestion again over the
    same folder overwrites the rows that share an id instead of colliding
    with them. NOTE(review): if a document now produces fewer chunks than
    on an earlier run, the higher-numbered rows from that run are not
    removed here.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        None. A warning is printed when no documents are found; otherwise
        a summary with the collection's total entry count is printed.
    """
    # Word keeps a "~$name.docx" lock file next to any open document; it is
    # not a real document, so it is skipped. Sorting makes the ingestion
    # order independent of the directory listing order.
    files = sorted(
        name for name in os.listdir(folder_path)
        if name.endswith('.docx') and not name.startswith('~$')
    )
    if not files:
        print("⚠️ No .docx files found. Please check the folder path.")
        return

    # Embed and store chunks in slices rather than one call per chunk,
    # while keeping each individual request to the store small.
    batch_size = 256

    for filename in files:
        file_path = os.path.join(folder_path, filename)
        chunks = chunk_text(extract_text_from_docx(file_path))
        # A document with no text yields no chunks, so this loop simply
        # does not run for it.
        for start in range(0, len(chunks), batch_size):
            batch = chunks[start:start + batch_size]
            vectors = model.encode(batch).tolist()
            collection.upsert(
                ids=[f"{filename}_{i}" for i, _ in enumerate(batch, start)],
                embeddings=vectors,
                documents=batch,
                metadatas=[{"filename": filename} for _ in batch],
            )

    print(f"✅ Documents ingested. Total entries: {collection.count()}")
def search_knowledge_base(query, top_k=3):
    """Query the collection for the entries most similar to ``query``.

    The query is embedded with the module-level ``model`` and passed to
    ``collection.query``.

    Args:
        query: The search text to embed.
        top_k: Number of results requested from the collection.

    Returns:
        The raw result of ``collection.query``, requested with embeddings,
        documents, metadatas and distances included.
    """
    query_vector = model.encode(query).tolist()
    return collection.query(
        query_embeddings=[query_vector],
        n_results=top_k,
        include=['embeddings', 'documents', 'metadatas', 'distances'],
    )