|
|
from copy import deepcopy |
|
|
|
|
|
from chromadb.config import Settings |
|
|
from langchain_chroma import Chroma |
|
|
from loguru import logger |
|
|
|
|
|
from langflow.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store |
|
|
from langflow.base.vectorstores.utils import chroma_collection_to_data |
|
|
from langflow.io import BoolInput, DataInput, DropdownInput, HandleInput, IntInput, MultilineInput, StrInput |
|
|
from langflow.schema import Data |
|
|
|
|
|
|
|
|
class ChromaVectorStoreComponent(LCVectorStoreComponent): |
|
|
"""Chroma Vector Store with search capabilities.""" |
|
|
|
|
|
display_name: str = "Chroma DB" |
|
|
description: str = "Chroma Vector Store with search capabilities" |
|
|
documentation = "https://python.langchain.com/docs/integrations/vectorstores/chroma" |
|
|
name = "Chroma" |
|
|
icon = "Chroma" |
|
|
|
|
|
inputs = [ |
|
|
StrInput( |
|
|
name="collection_name", |
|
|
display_name="Collection Name", |
|
|
value="langflow", |
|
|
), |
|
|
StrInput( |
|
|
name="persist_directory", |
|
|
display_name="Persist Directory", |
|
|
), |
|
|
MultilineInput( |
|
|
name="search_query", |
|
|
display_name="Search Query", |
|
|
), |
|
|
DataInput( |
|
|
name="ingest_data", |
|
|
display_name="Ingest Data", |
|
|
is_list=True, |
|
|
), |
|
|
HandleInput(name="embedding", display_name="Embedding", input_types=["Embeddings"]), |
|
|
StrInput( |
|
|
name="chroma_server_cors_allow_origins", |
|
|
display_name="Server CORS Allow Origins", |
|
|
advanced=True, |
|
|
), |
|
|
StrInput( |
|
|
name="chroma_server_host", |
|
|
display_name="Server Host", |
|
|
advanced=True, |
|
|
), |
|
|
IntInput( |
|
|
name="chroma_server_http_port", |
|
|
display_name="Server HTTP Port", |
|
|
advanced=True, |
|
|
), |
|
|
IntInput( |
|
|
name="chroma_server_grpc_port", |
|
|
display_name="Server gRPC Port", |
|
|
advanced=True, |
|
|
), |
|
|
BoolInput( |
|
|
name="chroma_server_ssl_enabled", |
|
|
display_name="Server SSL Enabled", |
|
|
advanced=True, |
|
|
), |
|
|
BoolInput( |
|
|
name="allow_duplicates", |
|
|
display_name="Allow Duplicates", |
|
|
advanced=True, |
|
|
info="If false, will not add documents that are already in the Vector Store.", |
|
|
), |
|
|
DropdownInput( |
|
|
name="search_type", |
|
|
display_name="Search Type", |
|
|
options=["Similarity", "MMR"], |
|
|
value="Similarity", |
|
|
advanced=True, |
|
|
), |
|
|
IntInput( |
|
|
name="number_of_results", |
|
|
display_name="Number of Results", |
|
|
info="Number of results to return.", |
|
|
advanced=True, |
|
|
value=10, |
|
|
), |
|
|
IntInput( |
|
|
name="limit", |
|
|
display_name="Limit", |
|
|
advanced=True, |
|
|
info="Limit the number of records to compare when Allow Duplicates is False.", |
|
|
), |
|
|
] |
|
|
|
|
|
@check_cached_vector_store |
|
|
def build_vector_store(self) -> Chroma: |
|
|
"""Builds the Chroma object.""" |
|
|
try: |
|
|
from chromadb import Client |
|
|
from langchain_chroma import Chroma |
|
|
except ImportError as e: |
|
|
msg = "Could not import Chroma integration package. Please install it with `pip install langchain-chroma`." |
|
|
raise ImportError(msg) from e |
|
|
|
|
|
chroma_settings = None |
|
|
client = None |
|
|
if self.chroma_server_host: |
|
|
chroma_settings = Settings( |
|
|
chroma_server_cors_allow_origins=self.chroma_server_cors_allow_origins or [], |
|
|
chroma_server_host=self.chroma_server_host, |
|
|
chroma_server_http_port=self.chroma_server_http_port or None, |
|
|
chroma_server_grpc_port=self.chroma_server_grpc_port or None, |
|
|
chroma_server_ssl_enabled=self.chroma_server_ssl_enabled, |
|
|
) |
|
|
client = Client(settings=chroma_settings) |
|
|
|
|
|
|
|
|
persist_directory = self.resolve_path(self.persist_directory) if self.persist_directory is not None else None |
|
|
|
|
|
chroma = Chroma( |
|
|
persist_directory=persist_directory, |
|
|
client=client, |
|
|
embedding_function=self.embedding, |
|
|
collection_name=self.collection_name, |
|
|
) |
|
|
|
|
|
self._add_documents_to_vector_store(chroma) |
|
|
self.status = chroma_collection_to_data(chroma.get(limit=self.limit)) |
|
|
return chroma |
|
|
|
|
|
def _add_documents_to_vector_store(self, vector_store: "Chroma") -> None: |
|
|
"""Adds documents to the Vector Store.""" |
|
|
if not self.ingest_data: |
|
|
self.status = "" |
|
|
return |
|
|
|
|
|
stored_documents_without_id = [] |
|
|
if self.allow_duplicates: |
|
|
stored_data = [] |
|
|
else: |
|
|
stored_data = chroma_collection_to_data(vector_store.get(limit=self.limit)) |
|
|
for value in deepcopy(stored_data): |
|
|
del value.id |
|
|
stored_documents_without_id.append(value) |
|
|
|
|
|
documents = [] |
|
|
for _input in self.ingest_data or []: |
|
|
if isinstance(_input, Data): |
|
|
if _input not in stored_documents_without_id: |
|
|
documents.append(_input.to_lc_document()) |
|
|
else: |
|
|
msg = "Vector Store Inputs must be Data objects." |
|
|
raise TypeError(msg) |
|
|
|
|
|
if documents and self.embedding is not None: |
|
|
logger.debug(f"Adding {len(documents)} documents to the Vector Store.") |
|
|
vector_store.add_documents(documents) |
|
|
else: |
|
|
logger.debug("No documents to add to the Vector Store.") |
|
|
|