Spaces:
Runtime error
Runtime error
| from abc import ABC | |
| from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type | |
| import numpy as np | |
| from langchain_core.documents import Document | |
| from langchain_core.embeddings import Embeddings | |
| from langchain_core.pydantic_v1 import Field | |
| from langchain_core.vectorstores import VectorStore | |
| from langchain.vectorstores.utils import maximal_marginal_relevance | |
| if TYPE_CHECKING: | |
| from docarray import BaseDoc | |
| from docarray.index.abstract import BaseDocIndex | |
def _check_docarray_import() -> None:
    """Verify that a supported version of ``docarray`` can be imported.

    Raises:
        ImportError: If ``docarray`` cannot be imported, or if the installed
            version is 0.31.x or older (0.32.0 or newer is required).
    """
    # Keep only the import inside the ``try``: the version check below also
    # raises ImportError, and it must not be caught and replaced by the
    # "could not import" message.
    try:
        import docarray
    except ImportError as err:
        raise ImportError(
            "Could not import docarray python package. "
            'Please install it with `pip install "langchain[docarray]"`.'
        ) from err

    da_version = docarray.__version__.split(".")
    if int(da_version[0]) == 0 and int(da_version[1]) <= 31:
        raise ImportError(
            f"To use the DocArrayHnswSearch VectorStore the docarray "
            f"version >=0.32.0 is expected, received: {docarray.__version__}. "
            f"To upgrade, please run: `pip install -U docarray`."
        )
class DocArrayIndex(VectorStore, ABC):
    """Base class for `DocArray` based vector stores."""

    def __init__(
        self,
        doc_index: "BaseDocIndex",
        embedding: Embeddings,
    ):
        """Initialize a vector store from DocArray's DocIndex.

        Args:
            doc_index: DocArray document index used to store and search docs.
            embedding: Embedding model used to embed texts and queries.
        """
        self.doc_index = doc_index
        self.embedding = embedding

    @staticmethod
    def _get_doc_cls(**embeddings_params: Any) -> Type["BaseDoc"]:
        """Get docarray Document class describing the schema of DocIndex.

        Args:
            **embeddings_params: Keyword arguments forwarded to ``Field`` for
                the ``embedding`` field of the generated document class.

        Returns:
            A ``BaseDoc`` subclass with ``text``, ``embedding`` and
            ``metadata`` fields.
        """
        # Imported lazily so the module can be imported without docarray.
        from docarray import BaseDoc
        from docarray.typing import NdArray

        class DocArrayDoc(BaseDoc):
            text: Optional[str]
            embedding: Optional[NdArray] = Field(**embeddings_params)
            metadata: Optional[dict]

        return DocArrayDoc

    @property
    def doc_cls(self) -> Type["BaseDoc"]:
        """Document class (schema) of the wrapped document index.

        Exposed as a property because the rest of the class instantiates it
        directly, e.g. ``self.doc_cls(text=..., embedding=...)``.

        Raises:
            ValueError: If the document index has no schema set.
        """
        if self.doc_index._schema is None:
            raise ValueError("doc_index expected to have non-null _schema attribute.")
        return self.doc_index._schema

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed texts and add to the vector store.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        # Materialize once: ``texts`` may be a one-shot iterator (e.g. a
        # generator), which would be exhausted after embedding and leave
        # nothing to index on a second pass.
        text_list = list(texts)
        embeddings = self.embedding.embed_documents(text_list)

        ids: List[str] = []
        for i, (t, e) in enumerate(zip(text_list, embeddings)):
            m = metadatas[i] if metadatas else {}
            doc = self.doc_cls(text=t, embedding=e, metadata=m)
            self.doc_index.index([doc])
            ids.append(str(doc.id))

        return ids

    def similarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of documents most similar to the query text and
            cosine distance in float for each.
            Lower score represents more similarity.
        """
        query_embedding = self.embedding.embed_query(query)
        query_doc = self.doc_cls(embedding=query_embedding)  # type: ignore
        docs, scores = self.doc_index.find(query_doc, search_field="embedding", limit=k)

        result = [
            (Document(page_content=doc.text, metadata=doc.metadata), score)
            for doc, score in zip(docs, scores)
        ]
        return result

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query.
        """
        results = self.similarity_search_with_score(query, k=k, **kwargs)
        return [doc for doc, _ in results]

    def _similarity_search_with_relevance_scores(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return docs and relevance scores, normalized on a scale from 0 to 1.

        0 is dissimilar, 1 is most similar.

        Raises:
            NotImplementedError: Always; this class does not implement it.
        """
        raise NotImplementedError()

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query vector.
        """
        query_doc = self.doc_cls(embedding=embedding)  # type: ignore
        docs = self.doc_index.find(
            query_doc, search_field="embedding", limit=k
        ).documents

        result = [
            Document(page_content=doc.text, metadata=doc.metadata) for doc in docs
        ]
        return result

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                of diversity among the results with 0 corresponding
                to maximum diversity and 1 to minimum diversity.
                Defaults to 0.5.
                NOTE(review): this value is accepted but not forwarded to
                ``maximal_marginal_relevance`` below - confirm whether that
                is intended.

        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        query_embedding = self.embedding.embed_query(query)
        query_doc = self.doc_cls(embedding=query_embedding)  # type: ignore

        # Fetch ``fetch_k`` candidates, then pick ``k`` of them with MMR.
        docs = self.doc_index.find(
            query_doc, search_field="embedding", limit=fetch_k
        ).documents

        mmr_selected = maximal_marginal_relevance(
            np.array(query_embedding), docs.embedding, k=k
        )
        results = [
            Document(page_content=docs[idx].text, metadata=docs[idx].metadata)
            for idx in mmr_selected
        ]
        return results