# services/kb_creation.py
# Knowledge-base creation: ingest .docx documents into ChromaDB and search them.
import os
from docx import Document
from sentence_transformers import SentenceTransformer
import chromadb
# Initialize ChromaDB client
# Persistent on-disk store under ./chroma_db (relative to the working directory).
client = chromadb.PersistentClient(path="chroma_db")
# Single shared collection holding every ingested document chunk.
collection = client.get_or_create_collection(name="knowledge_base")
# Load embedding model
# all-MiniLM-L6-v2: lightweight sentence-embedding model (384-dim vectors).
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
"""
def extract_text_from_docx(file_path):
doc = Document(file_path)
return '\n'.join([para.text for para in doc.paragraphs if para.text.strip()])
def chunk_text(text, max_words=300):
words = text.split()
return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]
def ingest_documents(file_path):
if not os.path.exists(file_path):
return {"status": "error", "message": f"File not found: {file_path}"}
text = extract_text_from_docx(file_path)
if not text.strip():
return {"status": "error", "message": "Document is empty"}
chunks = chunk_text(text)
print(f"πŸ“„ Ingesting {os.path.basename(file_path)} with {len(chunks)} chunks")
for i, chunk in enumerate(chunks):
embedding = model.encode(chunk).tolist()
doc_id = f"{os.path.basename(file_path)}_{i}"
collection.add(
ids=[doc_id],
embeddings=[embedding],
documents=[chunk],
metadatas=[{"filename": os.path.basename(file_path)}]
)
return {"status": "success", "chunks": len(chunks), "message": "Ingestion completed"}
def ingest_all_documents(folder_path):
files = [f for f in os.listdir(folder_path) if f.endswith('.docx')]
if not files:
return {"status": "error", "message": "No .docx files found"}
for file in files:
file_path = os.path.join(folder_path, file)
ingest_documents(file_path)
return {"status": "success", "message": f"Ingested {len(files)} files"}
def search_knowledge_base(query, top_k=3):
query_embedding = model.encode(query).tolist()
print("query_embedding",query_embedding)
results = collection.query(
query_embeddings=[query_embedding],
n_results=top_k,
include=['documents', 'metadatas', 'distances']
)
print("results",results)
return results
"""
def extract_text_from_docx(file_path):
    """Return the full text of a .docx file, one paragraph per line."""
    document = Document(file_path)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return '\n'.join(paragraph_texts)
def chunk_text(text, max_words=300):
    """Break *text* into chunks of at most *max_words* whitespace-separated words.

    Smaller chunks tend to produce higher-quality embeddings for retrieval.
    """
    tokens = text.split()
    chunks = []
    start = 0
    while start < len(tokens):
        chunks.append(' '.join(tokens[start:start + max_words]))
        start += max_words
    return chunks
def ingest_documents(folder_path):
    """Read .docx files, chunk text, generate embeddings, and store in ChromaDB."""
    docx_files = [name for name in os.listdir(folder_path) if name.endswith('.docx')]
    if not docx_files:
        print("⚠️ No .docx files found. Please check the folder path.")
        return
    for name in docx_files:
        full_path = os.path.join(folder_path, name)
        # One embedding + one add() call per chunk; ids are "<filename>_<index>".
        for index, chunk in enumerate(chunk_text(extract_text_from_docx(full_path))):
            vector = model.encode(chunk).tolist()
            collection.add(
                ids=[f"{name}_{index}"],
                embeddings=[vector],
                documents=[chunk],
                metadatas=[{"filename": name}],
            )
    print(f"βœ… Documents ingested. Total entries: {collection.count()}")
def search_knowledge_base(query, top_k=3):
    """Search ChromaDB using semantic similarity; return the raw query result."""
    embedded_query = model.encode(query).tolist()
    results = collection.query(
        query_embeddings=[embedded_query],
        n_results=top_k,
        include=['embeddings', 'documents', 'metadatas', 'distances'],
    )
    return results
# Example usage:
# ingest_documents("path/to/docs")
# results = search_knowledge_base("inventory mismatch in warehouse")
# print(results)
#
# To run as a script:
# if __name__ == "__main__":
#     folder_path = "path/to/GenericSOPsForTesting"  # adjust to your docs folder
#     if os.path.exists(folder_path):
#         ingest_documents(folder_path)
#     else:
#         print("❌ Invalid folder path. Please check and try again.")