Spaces:
Paused
Paused
| from abc import ABC, abstractmethod | |
| from collections.abc import Sequence | |
| from typing import Any, Optional | |
| from pydantic import BaseModel, Field | |
class Document(BaseModel):
    """Class for storing a piece of text and associated metadata."""

    # The text content of the document (required).
    page_content: str

    # Optional list of floats attached to the document; defaults to None.
    # NOTE(review): presumably the embedding of page_content - confirm with callers.
    vector: Optional[list[float]] = None

    # Arbitrary metadata about the page content (e.g., source, relationships to
    # other documents, etc.). default_factory builds a fresh dict for every
    # instance, so the default is never shared between documents.
    metadata: Optional[dict] = Field(default_factory=dict)

    # Defaults to "dify".
    # NOTE(review): presumably names the system that produced the document - confirm.
    provider: Optional[str] = "dify"
class BaseDocumentTransformer(ABC):
    """Abstract base class for document transformation systems.

    A document transformation system takes a sequence of Documents and returns a
    sequence of transformed Documents.

    Both ``transform_documents`` and ``atransform_documents`` are abstract:
    a concrete subclass must override each of them before it can be
    instantiated, as the example below does.

    Example:
        .. code-block:: python

            class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
                embeddings: Embeddings
                similarity_fn: Callable = cosine_similarity
                similarity_threshold: float = 0.95

                class Config:
                    arbitrary_types_allowed = True

                def transform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[Document]:
                    stateful_documents = get_stateful_documents(documents)
                    embedded_documents = _get_embeddings_from_stateful_docs(
                        self.embeddings, stateful_documents
                    )
                    included_idxs = _filter_similar_embeddings(
                        embedded_documents, self.similarity_fn, self.similarity_threshold
                    )
                    return [stateful_documents[i] for i in sorted(included_idxs)]

                async def atransform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[Document]:
                    raise NotImplementedError

    """

    # Marked abstract so that a subclass which forgets to implement the method
    # fails at instantiation time instead of silently returning None.
    @abstractmethod
    def transform_documents(self, documents: Sequence[Document], **kwargs: Any) -> Sequence[Document]:
        """Transform a list of documents.

        Args:
            documents: A sequence of Documents to be transformed.
            **kwargs: Additional, implementation-specific keyword arguments.

        Returns:
            A list of transformed Documents.
        """

    # Abstract for the same reason as the synchronous variant; implementations
    # without async support can raise NotImplementedError, as in the class example.
    @abstractmethod
    async def atransform_documents(self, documents: Sequence[Document], **kwargs: Any) -> Sequence[Document]:
        """Asynchronously transform a list of documents.

        Args:
            documents: A sequence of Documents to be transformed.
            **kwargs: Additional, implementation-specific keyword arguments.

        Returns:
            A list of transformed Documents.
        """