Spaces:
Sleeping
Sleeping
| from typing import List, Dict, Union, Any | |
| import numpy as np | |
| from annoy import AnnoyIndex | |
| from .vector_store import VectorStore | |
class AnnoyDB(VectorStore):
    """Approximate-nearest-neighbour vector store backed by an Annoy index.

    Maintains three parallel, insertion-ordered structures: ``documents``
    (raw texts), ``metadata`` (one dict per document), and the Annoy index
    (one embedding per document). Annoy item ids are the list positions.
    """

    def __init__(
        self,
        embedding_dim: int,
        metric: str = 'angular',
        model: Any = None,
    ) -> None:
        """
        Args:
            embedding_dim: Dimensionality of the stored embeddings.
            metric: Annoy distance metric ('angular', 'euclidean', ...).
            model: Optional embedding model exposing ``encode(text, ...)``
                (e.g. a SentenceTransformer). Only required when using
                ``add_document``; batch paths take pre-computed embeddings.
        """
        self.documents = []
        self.metadata = []
        self.embedding_dim = embedding_dim
        # Bug fix: add_document referenced self.model, but it was never
        # assigned anywhere, so that path always raised AttributeError.
        self.model = model
        self.index = AnnoyIndex(embedding_dim, metric)
        self.index_built = False

    def add_document(self, text: str, metadata: Dict[str, Any] = None):
        """
        Embed a single document with ``self.model`` and add it to the index.

        Args:
            text: The document text
            metadata: Optional metadata about the document

        Raises:
            RuntimeError: If no embedding model was supplied to ``__init__``.
        """
        if self.model is None:
            raise RuntimeError(
                "AnnoyDB was constructed without an embedding model; "
                "pass model= to __init__ or use add_documents()/add_data() "
                "with pre-computed embeddings."
            )
        self.documents.append(text)
        self.metadata.append(metadata or {})
        # Generate embedding using the configured model
        embedding = self.model.encode(text, show_progress_bar=False)
        # Add to Annoy index; item id mirrors the list position
        self.index.add_item(len(self.documents) - 1, embedding)
        self.index_built = False  # index must be (re)built before searching

    def add_documents(self, texts: List[str], embeddings: np.ndarray, metadata_list: List[Dict[str, Any]] = None):
        """
        Batch add pre-embedded documents to the search index.

        Args:
            texts: List of document texts
            embeddings: One embedding vector per text, row-aligned with texts
            metadata_list: Optional list of metadata dictionaries
        """
        if metadata_list is None:
            metadata_list = [{} for _ in texts]
        # Add documents and embeddings in lockstep
        print("Adding to index...")
        for text, metadata, embedding in zip(texts, metadata_list, embeddings):
            self.documents.append(text)
            self.metadata.append(metadata)
            self.index.add_item(len(self.documents) - 1, embedding)
        self.index_built = False
        print("Done")

    def add_data(self, embedding: np.ndarray, document: str):
        """Add a single pre-embedded document with empty metadata."""
        item_id = len(self.documents)
        self.index.add_item(item_id, embedding)
        self.documents.append(document)
        # Bug fix: keep the parallel metadata list aligned with documents
        # (previously only documents grew, desynchronizing the two lists).
        self.metadata.append({})
        self.index_built = False

    def build(self, num_trees: int = 10):
        """Build the Annoy forest; must be called before ``search``.

        Args:
            num_trees: More trees give better recall at the cost of memory.
        """
        self.index.build(num_trees)
        self.index_built = True  # previously the flag was never set True

    def save(self, filepath: str):
        """Persist the Annoy index to disk (embeddings only — the
        documents/metadata lists are NOT saved)."""
        self.index.save(filepath)

    def load(self, filepath: str):
        """Load a previously saved Annoy index from disk."""
        self.index.load(filepath)
        self.index_built = True  # a loaded Annoy index is already searchable

    def search(self, query_embedding: np.ndarray, top_k: int = 5) -> List[Dict[str, Union[str, float]]]:
        """
        Return the ``top_k`` stored documents closest to the query.

        Args:
            query_embedding: Query vector of length ``embedding_dim``.
            top_k: Maximum number of results to return.

        Returns:
            List of dicts with keys ``document`` and ``score``, where
            score = 1 / (1 + distance) so higher means more similar.
        """
        indices, distances = self.index.get_nns_by_vector(
            query_embedding, top_k, include_distances=True
        )
        return [
            {
                "document": self.documents[idx],
                "score": 1 / (1 + distance),  # convert distance to similarity
            }
            for idx, distance in zip(indices, distances)
        ]