Spaces:
Sleeping
Sleeping
| from enum import Enum | |
| import numpy as np | |
| import uuid | |
| from collections import defaultdict | |
| from typing import List, Tuple, Callable | |
| from aimakerspace.openai_utils.embedding import EmbeddingModel | |
| import asyncio | |
| from qdrant_client import models, QdrantClient | |
| from qdrant_client.models import PointStruct,VectorParams,Distance,Batch,VectorStruct,Payload | |
# Name of the Qdrant collection that VectorDatabase creates, writes to and searches.
collection_name = "embedding_collection"
def cosine_similarity(vector_a: np.ndarray, vector_b: np.ndarray) -> float:
    """Compute the cosine similarity between two vectors.

    Args:
        vector_a: First vector (any array-like accepted by ``np.dot``).
        vector_b: Second vector, with the same length as ``vector_a``.

    Returns:
        The dot product divided by the product of the two Euclidean norms.
        Returns ``0.0`` when either vector has zero magnitude, because the
        similarity is undefined there and the division would otherwise
        produce ``nan`` together with a NumPy runtime warning.
    """
    denominator = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    if denominator == 0:
        # A zero vector has no direction; report "no similarity" instead of nan.
        return 0.0
    return np.dot(vector_a, vector_b) / denominator
def euclidean_distance(vector_a: np.ndarray, vector_b: np.ndarray) -> float:
    """Compute the Euclidean (L2) distance between two vectors.

    Args:
        vector_a: First vector; NumPy arrays and plain sequences are accepted.
        vector_b: Second vector, with the same length as ``vector_a``.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    # Coerce first: subtracting two plain Python lists raises TypeError.
    difference = np.asarray(vector_a) - np.asarray(vector_b)
    return np.sqrt(np.sum(difference ** 2))
def manhattan_distance(vector_a: np.ndarray, vector_b: np.ndarray) -> float:
    """Compute the Manhattan (L1) distance between two vectors.

    Args:
        vector_a: First vector; NumPy arrays and plain sequences are accepted.
        vector_b: Second vector, with the same length as ``vector_a``.

    Returns:
        The sum of the absolute element-wise differences.
    """
    # Coerce first: subtracting two plain Python lists raises TypeError.
    difference = np.asarray(vector_a) - np.asarray(vector_b)
    return np.sum(np.abs(difference))
def minkowski_distance(vector_a: np.array, vector_b: np.array, p: float) -> float:
    """
    Return the Minkowski distance of order ``p`` between two vectors.

    Parameters:
    vector_a (np.array): First vector (converted with ``np.asarray``).
    vector_b (np.array): Second vector (converted with ``np.asarray``).
    p (float): Order of the norm; p=1 is the Manhattan distance and p=2 is
        the Euclidean distance.

    Returns:
    float: The p-th root of the sum of the absolute differences raised to p.
    """
    # Accept any array-like input by converting to NumPy arrays first.
    gap = np.abs(np.asarray(vector_a) - np.asarray(vector_b))
    # Sum the p-th powers, then take the p-th root of the total.
    total = np.sum(gap ** p)
    return total ** (1 / p)
class DistanceMeasure(Enum):
    """Named handles for the distance/similarity functions defined in this module."""

    # NOTE(review): plain functions assigned in an Enum body are descriptors, so
    # the enum machinery treats them as methods rather than members. Each
    # attribute below therefore resolves to the raw function, and iterating
    # over DistanceMeasure yields nothing. Confirm whether real members were
    # intended before relying on enum features (iteration, .name, .value).
    COSINE_SIMILARITY = cosine_similarity
    EUCLIDEAN_DISTANCE = euclidean_distance
    MANHATTAN_DISTANCE = manhattan_distance
    MINKOWSKI_DISTANCE = minkowski_distance
class VectorDatabaseOptions(Enum):
    """Storage backends that VectorDatabase can be constructed with."""

    # Vectors are kept in an in-process dictionary.
    DICTIONARY = "dictionary"
    # Vectors are kept in a Qdrant collection (created in ":memory:" mode).
    QDRANT = "qdrant"
class VectorDatabase:
    """Small text-to-vector store backed by a dictionary or by Qdrant.

    The backend is selected with ``vector_db_options``:

    * ``DICTIONARY`` keeps ``{text: vector}`` in ``self.vectors`` and scores
      every stored vector with a Python distance function on each search.
    * ``QDRANT`` keeps points in an in-memory Qdrant collection (named by the
      module-level ``collection_name``) under the named vector ``"text"``,
      with the source text stored in the payload key ``"text"``.
    """

    def __init__(
        self,
        vector_db_options: VectorDatabaseOptions = VectorDatabaseOptions.DICTIONARY,
        embedding_model: EmbeddingModel = None,
    ):
        """Create the store and, for Qdrant, the backing collection.

        Args:
            vector_db_options: Backend to use. Defaults to ``DICTIONARY`` so
                the class can be constructed without arguments.
            embedding_model: Model used to embed text; a default
                ``EmbeddingModel()`` is created when omitted.
        """
        self.vectors = None
        self.vector_db_options = vector_db_options
        self.embedding_model = embedding_model or EmbeddingModel()
        if vector_db_options == VectorDatabaseOptions.DICTIONARY:
            # The np.array factory is never triggered here: all reads go
            # through .get() / .items(), which do not create missing keys.
            self.vectors = defaultdict(np.array)
        if vector_db_options == VectorDatabaseOptions.QDRANT:
            self.qdrant_client = QdrantClient(":memory:")
            vector_params = VectorParams(
                # Read from self.embedding_model, not the argument: the
                # argument may be None when the default model is used.
                size=self.embedding_model.dimensions,
                distance=Distance.COSINE,
            )
            self.qdrant_client.create_collection(
                collection_name=collection_name,
                vectors_config={"text": vector_params},
            )

    def _uses_qdrant(self) -> bool:
        """Return True when this instance stores its vectors in Qdrant."""
        return self.vector_db_options == VectorDatabaseOptions.QDRANT

    def insert(self, key: str, vectors: np.ndarray) -> None:
        """Store one vector under the text ``key``.

        Args:
            key: The source text; used as the dictionary key or stored as the
                ``"text"`` payload of the Qdrant point.
            vectors: The embedding for ``key`` (array or sequence of floats).

        In Qdrant mode every call creates a point with a fresh UUID, so
        inserting the same ``key`` twice stores two points.
        """
        if not self._uses_qdrant():
            self.vectors[key] = np.asarray(vectors)
            return
        point = PointStruct(
            id=str(uuid.uuid4()),
            # Must match the named vector configured in __init__ ("text").
            vector={"text": np.asarray(vectors).tolist()},
            payload={"text": key},
        )
        self.qdrant_client.upsert(
            collection_name=collection_name,
            points=[point],
        )

    def search(
        self,
        query_vector: np.ndarray,
        k: int,
        distance_measure: Callable = cosine_similarity,
    ) -> List[Tuple[str, float]]:
        """Return up to ``k`` stored texts that best match ``query_vector``.

        Args:
            query_vector: Embedding to compare against the stored vectors.
            k: Maximum number of results.
            distance_measure: Scoring function ``f(query, stored) -> float``.
                Only used by the dictionary backend; the Qdrant collection is
                created with cosine distance, so this argument is ignored
                there.

        Returns:
            ``(text, score)`` pairs. The dictionary backend orders them by
            descending score, which suits similarity measures such as the
            default; with a true distance function the largest distances
            come first. The Qdrant backend returns hits in the order Qdrant
            reports them.
        """
        if not self._uses_qdrant():
            scored = [
                (key, distance_measure(query_vector, vector))
                for key, vector in self.vectors.items()
            ]
            return sorted(scored, key=lambda pair: pair[1], reverse=True)[:k]
        hits = self.qdrant_client.search(
            collection_name=collection_name,
            # (name, vector) selects the named vector configured in __init__.
            query_vector=("text", np.asarray(query_vector).tolist()),
            limit=k,
        )
        return [(hit.payload["text"], hit.score) for hit in hits]

    def search_by_text(
        self,
        query_text: str,
        k: int,
        distance_measure: Callable = cosine_similarity,
        return_as_text: bool = False,
    ) -> List[Tuple[str, float]]:
        """Embed ``query_text`` and search with the resulting vector.

        Args:
            query_text: Text to embed with ``self.embedding_model``.
            k: Maximum number of results.
            distance_measure: Forwarded to :meth:`search`.
            return_as_text: When True, return only the matched texts instead
                of ``(text, score)`` pairs.
        """
        query_vector = self.embedding_model.get_embedding(query_text)
        results = self.search(query_vector, k, distance_measure)
        return [result[0] for result in results] if return_as_text else results

    def retrieve_from_key(self, key: str) -> np.ndarray:
        """Return the vector stored for the text ``key``, or None if absent.

        In Qdrant mode the first point whose ``"text"`` payload equals ``key``
        is used.
        """
        if not self._uses_qdrant():
            return self.vectors.get(key, None)
        records, _next_offset = self.qdrant_client.scroll(
            collection_name=collection_name,
            scroll_filter=models.Filter(
                must=[
                    models.FieldCondition(
                        key="text",
                        match=models.MatchValue(value=key),
                    )
                ]
            ),
            limit=1,
            with_vectors=True,
        )
        if not records:
            return None
        # NOTE(review): for cosine collections Qdrant may hand back the
        # normalized form of the vector rather than the exact values that
        # were inserted -- confirm if callers need the raw embedding.
        return np.array(records[0].vector["text"])

    async def abuild_from_list(self, list_of_text: List[str]) -> "VectorDatabase":
        """Embed every text in ``list_of_text`` and store the results.

        Args:
            list_of_text: Texts to embed and index. An empty list is a no-op,
                so neither the embedding model nor Qdrant is called.

        Returns:
            This instance, to allow ``db = await db.abuild_from_list(...)``.
        """
        if not list_of_text:
            return self
        embeddings = await self.embedding_model.async_get_embeddings(list_of_text)
        if not self._uses_qdrant():
            for text, embedding in zip(list_of_text, embeddings):
                self.insert(text, np.array(embedding))
            return self
        # Build all points first so Qdrant receives a single batched upsert.
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector={"text": embedding},
                payload={"text": text},
            )
            for text, embedding in zip(list_of_text, embeddings)
        ]
        self.qdrant_client.upsert(
            collection_name=collection_name,
            points=points,
        )
        return self
if __name__ == "__main__":
    # Demo: index a handful of sentences, then query them by text and by key.
    list_of_text = [
        "I like to eat broccoli and bananas.",
        "I ate a banana and spinach smoothie for breakfast.",
        "Chinchillas and kittens are cute.",
        "My sister adopted a kitten yesterday.",
        "Look at this cute hamster munching on a piece of broccoli.",
    ]
    # The backend must be chosen explicitly; calling VectorDatabase() with no
    # arguments raised TypeError. An explicit embedding model is passed so the
    # Qdrant collection can be sized from its dimensions.
    vector_db = VectorDatabase(
        vector_db_options=VectorDatabaseOptions.QDRANT,
        embedding_model=EmbeddingModel(),
    )
    vector_db = asyncio.run(vector_db.abuild_from_list(list_of_text))
    k = 2
    searched_vector = vector_db.search_by_text("I think fruit is awesome!", k=k)
    print(f"Closest {k} vector(s):", searched_vector)
    retrieved_vector = vector_db.retrieve_from_key(
        "I like to eat broccoli and bananas."
    )
    print("Retrieved vector:", retrieved_vector)
    relevant_texts = vector_db.search_by_text(
        "I think fruit is awesome!", k=k, return_as_text=True
    )
    print(f"Closest {k} text(s):", relevant_texts)