File size: 2,225 Bytes
d184dfc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import numpy as np
from typing import List, Tuple
from aimakerspace.openai_utils.embedding import EmbeddingModel


class VectorDatabase:
    """In-memory vector store that ranks stored texts by cosine similarity.

    Texts and their embeddings are held in two parallel lists, ``self.texts``
    and ``self.embeddings``: the entry at index ``i`` of one belongs to the
    entry at index ``i`` of the other. Embeddings are requested from
    ``self.embedding_model`` (an ``EmbeddingModel`` instance).
    """

    def __init__(self):
        """Create an empty database together with its embedding model.

        Raises:
            Exception: Anything raised while constructing ``EmbeddingModel``
                is reported on stdout and re-raised unchanged.
        """
        print("Initializing VectorDatabase...")  # Debug log
        try:
            self.embedding_model = EmbeddingModel()
            print("EmbeddingModel initialized")  # Debug log
            self.embeddings = []
            self.texts = []
        except Exception as e:
            print(f"Error initializing VectorDatabase: {e}")  # Debug log
            raise

    async def abuild_from_list(self, texts: List[str]):
        """Embed ``texts`` and replace the database contents with them.

        The stored texts and embeddings are swapped in together, and only
        after the embeddings were generated successfully, so a failed call
        leaves the previous contents untouched instead of pairing new texts
        with stale embeddings.

        Args:
            texts: The strings to index.

        Returns:
            This instance, which allows call chaining.

        Raises:
            ValueError: If the embedding model returns a different number of
                embeddings than texts were supplied.
            Exception: Any error raised by the embedding model is reported on
                stdout and re-raised unchanged.
        """
        try:
            print(f"Building vector database from {len(texts)} texts")  # Debug log
            # Keep a private copy: if the caller mutates its list afterwards,
            # the texts must still line up with the embeddings by index.
            snapshot = list(texts)
            print("Generating embeddings...")  # Debug log
            embeddings = await self.embedding_model.async_get_embeddings(snapshot)
            if len(embeddings) != len(snapshot):
                raise ValueError(
                    f"Expected {len(snapshot)} embeddings, got {len(embeddings)}"
                )
            self.texts = snapshot
            self.embeddings = embeddings
            print(f"Generated {len(self.embeddings)} embeddings")  # Debug log
            return self
        except Exception as e:
            print(f"Error building vector database: {e}")  # Debug log
            raise

    def _cosine_similarity(self, a: List[float], b: List[float]) -> float:
        """Return the cosine similarity of vectors ``a`` and ``b``.

        Args:
            a: First vector.
            b: Second vector, of the same length as ``a``.

        Returns:
            ``dot(a, b) / (|a| * |b|)`` as a built-in ``float``. When either
            vector has zero norm the similarity is mathematically undefined;
            ``0.0`` is returned in that case so that callers never receive
            ``nan``, which would make sorting by score unreliable.
        """
        vec_a = np.asarray(a, dtype=float)
        vec_b = np.asarray(b, dtype=float)
        norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
        if norm_product == 0.0:
            return 0.0
        return float(np.dot(vec_a, vec_b) / norm_product)

    async def search_by_text(self, query: str, k: int = 4) -> List[Tuple[str, float]]:
        """Return the ``k`` stored texts most similar to ``query``.

        Args:
            query: Text to embed and compare against every stored embedding.
            k: Maximum number of results. Values below zero are treated as
                zero rather than being interpreted as a negative slice bound.

        Returns:
            Up to ``k`` ``(text, similarity)`` pairs ordered from most to
            least similar. Equal scores keep their insertion order because
            the sort is stable.

        Raises:
            Exception: Any error raised while embedding the query or scoring
                is reported on stdout and re-raised unchanged.
        """
        try:
            print(f"Searching for query: {query}")  # Debug log
            query_embedding = await self.embedding_model.embed_query(query)
            print("Generated query embedding")  # Debug log

            # Score every stored text against the query.
            similarities = [
                (text, self._cosine_similarity(query_embedding, embedding))
                for text, embedding in zip(self.texts, self.embeddings)
            ]

            # Sort by similarity and return top k
            similarities.sort(key=lambda pair: pair[1], reverse=True)
            print(f"Found {len(similarities)} matches")  # Debug log
            return similarities[:max(k, 0)]
        except Exception as e:
            print(f"Error in search_by_text: {e}")  # Debug log
            raise