"""Document ingestion and semantic search over a ChromaDB knowledge base.

Reads .docx files, splits their text into word-count-bounded chunks,
embeds each chunk with a SentenceTransformer model, and stores the
vectors in a persistent ChromaDB collection for similarity search.
"""

import os

import chromadb
from docx import Document
from sentence_transformers import SentenceTransformer

# Persistent vector store: data survives process restarts under ./chroma_db.
client = chromadb.PersistentClient(path="chroma_db")
collection = client.get_or_create_collection(name="knowledge_base")

# Small, fast general-purpose sentence-embedding model.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def extract_text_from_docx(file_path):
    """Return the full text of a .docx file, one paragraph per line."""
    doc = Document(file_path)
    return '\n'.join(para.text for para in doc.paragraphs)


def chunk_text(text, max_words=300):
    """Split *text* into chunks of at most *max_words* whitespace-separated words."""
    words = text.split()
    return [' '.join(words[i:i + max_words])
            for i in range(0, len(words), max_words)]


def ingest_documents(folder_path):
    """Read every .docx in *folder_path*, chunk, embed, and store in ChromaDB.

    Chunk IDs are "<filename>_<index>", so re-ingesting the same file
    replaces its previous entries rather than duplicating them.
    """
    # Guard first: os.listdir would raise on a bad path, while this
    # function otherwise reports problems via prints, not exceptions.
    if not os.path.isdir(folder_path):
        print(f"⚠️ Folder not found: {folder_path}")
        return

    files = [f for f in os.listdir(folder_path) if f.endswith('.docx')]
    if not files:
        print("⚠️ No .docx files found. Please check the folder path.")
        return

    for file in files:
        file_path = os.path.join(folder_path, file)
        chunks = chunk_text(extract_text_from_docx(file_path))
        if not chunks:
            continue  # empty document — nothing to embed

        # Batch-encode all chunks of a file in one call and add them to the
        # collection together: much faster than one encode/add per chunk,
        # and the stored result is identical.
        embeddings = model.encode(chunks).tolist()
        collection.add(
            ids=[f"{file}_{i}" for i in range(len(chunks))],
            embeddings=embeddings,
            documents=chunks,
            metadatas=[{"filename": file} for _ in chunks],
        )

    print(f"✅ Documents ingested. Total entries: {collection.count()}")


def search_knowledge_base(query, top_k=3):
    """Return the *top_k* chunks most semantically similar to *query*.

    The result is ChromaDB's query dict with embeddings, documents,
    metadatas, and distances included.
    """
    query_embedding = model.encode(query).tolist()
    return collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=['embeddings', 'documents', 'metadatas', 'distances'],
    )


# Example usage:
#   ingest_documents("path/to/docs")
#   results = search_knowledge_base("inventory mismatch in warehouse")