Spaces:

NOVA-chatbot
/

chatbot-backend

Sleeping

App Files Files Community

Jaita committed on Dec 8, 2025

Commit

37ff4d9

verified ·

1 Parent(s): 8fe898e

Create services/kb_creation.py

Browse files

Files changed (1) hide show

services/kb_creation.py +63 -0

services/kb_creation.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import os
+from docx import Document
+from sentence_transformers import SentenceTransformer
+import chromadb
+# Module-level setup: everything below runs once, when this module is imported.
+# Open the ChromaDB client backed by the relative path "chroma_db".
+client = chromadb.PersistentClient(path="chroma_db")
+# Fetch the "knowledge_base" collection, creating it if it does not exist yet.
+collection = client.get_or_create_collection(name="knowledge_base")
+# Embedding model shared by ingestion and search, so that stored chunks and
+# incoming queries are encoded by the same model.
+model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+def extract_text_from_docx(file_path):
+    """Return the text of every paragraph in a .docx file, joined by newlines."""
+    paragraphs = Document(file_path).paragraphs
+    return '\n'.join(p.text for p in paragraphs)
+def chunk_text(text, max_words=300):
+    """Split text into whitespace-delimited chunks of at most ``max_words`` words.
+
+    Args:
+        text: The text to split. Runs of whitespace (including newlines) are
+            collapsed, because the words are re-joined with single spaces.
+        max_words: Maximum number of words per chunk; must be at least 1.
+
+    Returns:
+        A list of chunk strings in document order. Empty or whitespace-only
+        text yields an empty list.
+
+    Raises:
+        ValueError: If ``max_words`` is less than 1.
+    """
+    # Guard explicitly: 0 would otherwise fail inside range() with an unrelated
+    # message, and a negative value would silently return no chunks at all.
+    if max_words < 1:
+        raise ValueError(f"max_words must be at least 1, got {max_words}")
+    words = text.split()
+    return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]
+def ingest_documents(folder_path):
+    """Read .docx files, chunk their text, embed the chunks, and store them in ChromaDB.
+
+    Args:
+        folder_path: Directory scanned (non-recursively) for ``.docx`` files.
+
+    Returns:
+        None. Progress and warnings are reported with ``print``.
+    """
+    # Names starting with "~$" are the lock files Word leaves next to an open
+    # document; they end in .docx but are not readable documents.
+    files = sorted(
+        f for f in os.listdir(folder_path)
+        if f.endswith('.docx') and not f.startswith('~$')
+    )
+    if not files:
+        print("⚠️ No .docx files found. Please check the folder path.")
+        return
+    for file in files:
+        file_path = os.path.join(folder_path, file)
+        chunks = chunk_text(extract_text_from_docx(file_path))
+        if not chunks:
+            # A document with no text has nothing to embed or store.
+            continue
+        # Encode the whole file in one call instead of once per chunk.
+        embeddings = model.encode(chunks).tolist()
+        # upsert rather than add, so that ingesting the same folder again
+        # refreshes the stored chunks instead of colliding on existing ids.
+        # NOTE(review): if a file now has fewer chunks than before, its old
+        # higher-numbered chunk ids are left in the collection.
+        collection.upsert(
+            ids=[f"{file}_{i}" for i in range(len(chunks))],
+            embeddings=embeddings,
+            documents=chunks,
+            metadatas=[{"filename": file} for _ in chunks],
+        )
+    print(f"✅ Documents ingested. Total entries: {collection.count()}")
+def search_knowledge_base(query, top_k=3):
+    """Return up to ``top_k`` stored chunks most similar to ``query``.
+
+    The query is embedded with the shared model and the raw ChromaDB query
+    result (embeddings, documents, metadatas and distances) is returned as-is.
+    """
+    query_vector = model.encode(query).tolist()
+    requested_fields = ['embeddings', 'documents', 'metadatas', 'distances']
+    return collection.query(
+        query_embeddings=[query_vector],
+        n_results=top_k,
+        include=requested_fields,
+    )