Spaces:
Paused
Paused
File size: 2,314 Bytes
e5902c3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
from langchain.storage import InMemoryByteStore, LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings
from langchain_chroma import Chroma
from . import files
from langchain_core.documents import Document
import uuid
class VectorDB:
    """Small wrapper around a Chroma vector store with cached embeddings.

    Documents inserted through :meth:`insert_document` carry a generated
    UUID under the ``"id"`` metadata key; :meth:`delete_documents` relies on
    that key to locate the stored records to remove.
    """

    def __init__(self, embeddings_model, in_memory=False, cache_dir="./cache"):
        """Build the embedding cache and open the Chroma database.

        Args:
            embeddings_model: Embeddings object wrapped by
                ``CacheBackedEmbeddings.from_bytes_store``.
            in_memory: When true, embeddings are cached in an
                ``InMemoryByteStore``; otherwise in a ``LocalFileStore``.
            cache_dir: Base directory from which the "embeddings" and
                "database" locations are derived via ``files.get_abs_path``.
        """
        print("Initializing VectorDB...")
        self.embeddings_model = embeddings_model

        # NOTE(review): files.get_abs_path is a project helper; presumably it
        # joins the parts into an absolute path - confirm against its source.
        em_cache = files.get_abs_path(cache_dir, "embeddings")
        db_cache = files.get_abs_path(cache_dir, "database")

        if in_memory:
            self.store = InMemoryByteStore()
        else:
            self.store = LocalFileStore(em_cache)

        # Set up the embeddings model with the chosen cache storage. The cache
        # namespace is the model's `model` attribute, falling back to
        # `model_name`, then to the literal "default".
        namespace = getattr(
            embeddings_model,
            "model",
            getattr(embeddings_model, "model_name", "default"),
        )
        self.embedder = CacheBackedEmbeddings.from_bytes_store(
            embeddings_model,
            self.store,
            namespace=namespace,
        )

        # `in_memory` only selects the embedding cache backend; Chroma is
        # given `persist_directory` in both cases.
        self.db = Chroma(
            embedding_function=self.embedder,
            persist_directory=db_cache,
        )

    def search_similarity(self, query, results=3):
        """Return up to `results` documents from a similarity search."""
        return self.db.similarity_search(query, k=results)

    def search_max_rel(self, query, results=3):
        """Return up to `results` documents from an MMR search."""
        return self.db.max_marginal_relevance_search(query, k=results)

    def delete_documents(self, query):
        """Delete stored documents that match `query` and return the count.

        The store is searched in batches of ``k``. Every hit whose score is
        below ``score_limit`` is looked up by its ``"id"`` metadata value and
        the records found are deleted. Hits without an ``"id"`` metadata key
        are skipped instead of raising ``KeyError``.

        Args:
            query: Text used for the similarity search.

        Returns:
            Number of stored records that were deleted.
        """
        # NOTE(review): the comparison treats a smaller score as a better
        # match (presumably a distance) - confirm for the configured store.
        score_limit = 1
        k = 2
        total = 0

        while True:
            hits = self.db.similarity_search_with_score(query, k=k)
            matched = [doc for doc, score in hits if score < score_limit]
            doc_ids = [
                doc.metadata["id"] for doc in matched if "id" in doc.metadata
            ]

            deleted = 0
            if doc_ids:
                # Look the records up by their metadata id and delete them
                # using the ids reported by the store.
                found = self.db.get(where={"id": {"$in": doc_ids}})
                if found["ids"]:
                    self.db.delete(ids=found["ids"])
                deleted = len(found["ids"])
                total += deleted

            # Stop when the batch was not full of matches (nothing more to
            # find), or when nothing could be removed: in that case the next
            # search would return the same hits and the loop would never end.
            if len(matched) < k or deleted == 0:
                break

        return total

    def insert_document(self, data):
        """Store `data` as a document and return its generated id.

        A fresh UUID4 string is saved under the ``"id"`` metadata key.
        """
        doc_id = str(uuid.uuid4())
        self.db.add_documents(
            documents=[Document(data, metadata={"id": doc_id})]
        )
        return doc_id
|