|
|
import chromadb |
|
|
from sentence_transformers import SentenceTransformer |
|
|
import os |
|
|
|
|
|
|
|
|
# Name of the sentence-transformers model; passed to SentenceTransformer()
# when the module-level ``model`` is created.
MODEL_NAME = "all-MiniLM-L6-v2"

# Name of the ChromaDB collection that this module reads from and writes to.
COLLECTION_NAME = "aura_mind_knowledge"

# Directory scanned for ``.txt`` files by embed_and_store_documents().
# This is a relative path, so it is resolved against the current working
# directory of the process.
KNOWLEDGE_BASE_DIR = "knowledge_base_data"
|
|
|
|
|
|
|
|
# NOTE: the three statements below run at import time, so importing this
# module creates the client, loads the embedding model and opens the
# collection before any function is called.

# ChromaDB client configured with the relative path "chroma_db".
client = chromadb.PersistentClient(path="chroma_db")

# Embedding model shared by embed_and_store_documents() and search_documents().
model = SentenceTransformer(MODEL_NAME)

# Reuse the collection if it already exists, otherwise create it.
collection = client.get_or_create_collection(name=COLLECTION_NAME)
|
|
|
|
|
def embed_and_store_documents():
    """
    Load the knowledge base into ChromaDB.

    Reads every ``.txt`` file found directly inside ``KNOWLEDGE_BASE_DIR``,
    generates one embedding per file with the module-level ``model``, and adds
    the texts to the module-level ``collection`` using each filename as the
    document id. Returns immediately when the collection already contains
    entries, so repeated calls do not re-embed the files.

    Raises:
        OSError: If ``KNOWLEDGE_BASE_DIR`` cannot be listed or a file cannot
            be opened.
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
    """
    if collection.count() > 0:
        print("✅ Knowledge base is already loaded into ChromaDB.")
        return

    print("Embedding and storing documents in ChromaDB...")
    documents = []
    ids = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document/id order reproducible between runs and machines.
    for filename in sorted(os.listdir(KNOWLEDGE_BASE_DIR)):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(KNOWLEDGE_BASE_DIR, filename)
        # A sub-directory whose name happens to end in ".txt" cannot be read
        # as a document; skip it instead of failing the whole load.
        if not os.path.isfile(path):
            continue
        # Explicit encoding: without it the text is decoded with the platform
        # locale default, which corrupts or rejects non-ASCII content on
        # systems whose default is not UTF-8.
        with open(path, "r", encoding="utf-8") as f:
            documents.append(f.read())
        ids.append(filename)

    if not documents:
        # Make the no-op visible rather than ending silently after the
        # "Embedding..." message above.
        print(f"⚠️ No .txt documents found in '{KNOWLEDGE_BASE_DIR}'; nothing was stored.")
        return

    embeddings = model.encode(documents).tolist()
    collection.add(
        embeddings=embeddings,
        documents=documents,
        ids=ids,
    )
    print(f"✅ Successfully stored {len(documents)} documents in ChromaDB.")
|
|
|
|
|
def search_documents(query: str, n_results: int = 1) -> list:
    """
    Search ChromaDB for the documents most relevant to a query.

    The query is embedded with the module-level ``model`` and the embedding
    is used to query the module-level ``collection``.

    Args:
        query: The search query. An empty, ``None`` or whitespace-only query
            returns an empty list without touching the model or the
            collection.
        n_results: The number of results to request from the collection.

    Returns:
        A list of relevant documents for the query, or an empty list when
        the query is blank or the collection returns no documents.
    """
    # A blank query has nothing meaningful to embed; searching with it would
    # only return arbitrary neighbours, so treat it like an empty query.
    if not query or not query.strip():
        return []

    query_embedding = model.encode([query]).tolist()
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=n_results,
    )
    # Exactly one query embedding was sent, so only the first entry of the
    # "documents" result is relevant.
    matches = results["documents"]
    return matches[0] if matches else []
|
|
|