File size: 1,640 Bytes
2be9eb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from typing import Callable, Union

import numpy as np

from utils.embedding import EmbeddingModel
from utils.metric import CosineSimilarity, DistanceMetric
from utils.vector import Vector


class VectorDatabase:
    """In-memory store of ``Vector`` objects keyed by ``Vector.key``.

    Vectors live in a plain dict and are searched by brute force: every
    stored vector is scored against the query on each ``search`` call.
    """

    def __init__(self, model: EmbeddingModel) -> None:
        """Create an empty database.

        Args:
            model: Embedding model used by the text-based helpers
                (``asearch_by_text`` and ``abuild_from_list``).
        """
        # Maps Vector.key -> Vector; inserting an existing key replaces it.
        self.vectors: dict[str, Vector] = {}
        self.model = model

    def insert(self, vector: Vector) -> None:
        """Store ``vector`` under ``vector.key``, replacing any existing entry."""
        self.vectors[vector.key] = vector

    def search(
        self,
        query_vector: np.ndarray,
        k: int,
        min_quality: float,
        distance_measure: Callable[..., float],
    ) -> list[tuple[Vector, float]]:
        """Return up to ``k`` best-scoring stored vectors for ``query_vector``.

        Despite its name, ``distance_measure`` is treated as a similarity:
        larger scores rank first and scores below ``min_quality`` are dropped.

        Args:
            query_vector: Embedding to compare against every stored vector.
            k: Maximum number of results. Values ``<= 0`` yield an empty list.
            min_quality: Minimum score a vector needs to be included.
            distance_measure: Callable invoked as
                ``distance_measure(query_vector, vector.data)`` that returns
                the score for one stored vector.

        Returns:
            ``(vector, score)`` tuples sorted by score, highest first. Ties
            keep insertion order because the sort is stable.
        """
        # Without this guard a negative k would reach the slice below and be
        # read as "drop the last |k| results" instead of "return nothing".
        if k <= 0:
            return []

        scores = []
        for vector in self.vectors.values():
            similarity = distance_measure(query_vector, vector.data)
            if similarity < min_quality:
                continue
            scores.append((vector, similarity))
        scores.sort(key=lambda pair: pair[1], reverse=True)
        return scores[:k]

    async def asearch_by_text(
        self,
        query_text: str,
        k: int = 5,
        min_quality: float = 0.5,
        metric_class: type[DistanceMetric] = CosineSimilarity,
        text_only: bool = False,
    ) -> Union[list[str], list[tuple[Vector, float]]]:
        """Embed ``query_text`` with the model and search the database.

        Args:
            query_text: Text passed to ``self.model.aget_embedding``.
            k: Maximum number of results.
            min_quality: Minimum score a vector needs to be included.
            metric_class: Object whose ``count`` attribute is used as the
                scoring callable for ``search``.
            text_only: When true, return only the keys of the matches.

        Returns:
            The matching keys if ``text_only`` is true, otherwise the
            ``(vector, score)`` tuples produced by ``search``.
        """
        # NOTE(review): metric_class is annotated as a class because `count`
        # is looked up on it without instantiation - confirm that `count` is
        # a static/class method on DistanceMetric implementations.
        query_vector = await self.model.aget_embedding(query_text)
        matches = self.search(query_vector, k, min_quality, metric_class.count)
        if text_only:
            return [vector.key for vector, _ in matches]
        return matches

    async def abuild_from_list(self, texts: list[str], metadata: dict) -> None:
        """Embed ``texts`` and insert one ``Vector`` per text.

        Each text becomes the key of its vector, so a repeated text replaces
        the earlier entry. The same ``metadata`` object is passed to every
        ``Vector`` that is created.

        Args:
            texts: Texts passed in one call to ``self.model.aget_embeddings``.
            metadata: Metadata handed to every created ``Vector``.
        """
        embeddings = await self.model.aget_embeddings(texts)
        # zip stops at the shorter input: if fewer embeddings than texts come
        # back, the surplus texts are skipped without an error.
        for item, emb in zip(texts, embeddings):
            self.insert(Vector(data=emb, key=item, metadata=metadata))