File size: 3,021 Bytes
0419215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91d064c
0419215
 
 
 
 
 
 
 
91d064c
 
 
0419215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=".cache")
dim = model.get_sentence_embedding_dimension() 

def embedd(documents):
    """Encode *documents* into a NumPy array of sentence embeddings."""
    vectors = model.encode(documents, convert_to_numpy=True)
    return vectors

class VectorStore:
    """Abstract interface for a document vector store.

    Concrete implementations (e.g. FAISS- or Annoy-backed stores) must
    override ``add`` and ``search``. The original methods silently did
    nothing (``pass``), which hides missing overrides; they now raise
    ``NotImplementedError`` so misuse fails loudly.
    """

    def __init__(self, embeddings):
        # NOTE(review): `embeddings` is unused here and the subclasses in
        # this file neither accept it nor call super().__init__() —
        # kept for backward compatibility with any external callers.
        pass

    def add(self, documents):
        """Embed and index *documents*. Must be overridden."""
        raise NotImplementedError

    def search(self, query_vector, top_k=5):
        """Return the *top_k* nearest stored documents. Must be overridden."""
        raise NotImplementedError


import faiss
import json
class FaissVectorStore(VectorStore):
    """Vector store backed by an exact FAISS ``IndexFlatL2`` index.

    Texts and metadata live in parallel Python lists; FAISS row ``i``
    corresponds to ``self.texts[i]`` / ``self.metadata[i]``.

    Fixes vs. the previous version:
    - removed leftover debug code that overwrote ``test.json`` in the
      working directory on every ``add()`` call;
    - ``search()`` no longer crashes when called before any ``add()``;
    - ``k`` is clamped to the index size — FAISS pads missing neighbors
      with index ``-1``, which previously aliased ``self.texts[-1]`` and
      returned wrong results.
    """

    def __init__(self):
        self.index = None      # lazily created on first add(), once dim is known
        self.texts = []        # raw document strings, insertion order
        self.metadata = []     # metadata dicts, parallel to self.texts

    def add(self, items):
        """Embed and index *items*.

        Each item is a dict with a ``'raw_text'`` key; all other keys are
        stored as metadata. An empty list is a no-op.
        """
        if not items:
            return
        raw_texts = [x["raw_text"] for x in items]
        metas = [{k: v for k, v in x.items() if k != "raw_text"} for x in items]

        # FAISS requires float32 input.
        vectors = model.encode(raw_texts, convert_to_numpy=True).astype('float32')
        if self.index is None:
            self.index = faiss.IndexFlatL2(vectors.shape[1])
        self.index.add(vectors)

        self.texts.extend(raw_texts)
        self.metadata.extend(metas)

    def search(self, queries, k=3):
        """Return the top-*k* matches for each query string.

        Returns one list per query of ``{"score", "text", "meta"}`` dicts.
        ``IndexFlatL2`` reports *squared* L2 distance, so ``1 - D/2``
        equals cosine similarity when embeddings are unit-normalized
        (assumed for this model — verify against the model card).
        Queries against an empty store return empty result lists.
        """
        if self.index is None or self.index.ntotal == 0:
            return [[] for _ in queries]
        k = min(k, self.index.ntotal)
        q_vecs = model.encode(queries, convert_to_numpy=True).astype('float32')
        D, I = self.index.search(q_vecs, k)
        results = []
        for i in range(len(queries)):
            results.append([
                {
                    "score": 1 - D[i][j] / 2,
                    "text": self.texts[I[i][j]],
                    "meta": self.metadata[I[i][j]]
                }
                for j in range(k)
            ])
        return results



from annoy import AnnoyIndex
class AnnoyVectorStore(VectorStore):
    """Vector store backed by Annoy (approximate NN, 'angular' metric).

    Fixes vs. the previous version:
    - an Annoy index is immutable once ``build()`` is called, so a second
      ``add()`` used to fail; all vectors are now retained and the index
      is rebuilt from scratch on each ``add()``;
    - the score formula: Annoy's 'angular' distance is
      ``sqrt(2 - 2*cos)``, so cosine similarity is ``1 - d**2 / 2``
      (the old ``1 - d / 2`` was not a cosine score and disagreed with
      the FAISS store);
    - ``search()`` no longer crashes when called before any ``add()``.
    """

    def __init__(self, n_trees=10):
        self.index = None       # rebuilt on every add()
        self.texts = []         # raw document strings, insertion order
        self.metadata = []      # metadata dicts, parallel to self.texts
        self.n_trees = n_trees  # Annoy build parameter: more trees = better recall
        self._vectors = []      # all embeddings, kept so the index can be rebuilt

    def add(self, items):
        """Embed *items* (dicts with 'raw_text' + metadata) and rebuild the index."""
        if not items:
            return
        raw_texts = [x["raw_text"] for x in items]
        metas = [{k: v for k, v in x.items() if k != "raw_text"} for x in items]
        vectors = model.encode(raw_texts, convert_to_numpy=True)

        self._vectors.extend(vectors)
        self.texts.extend(raw_texts)
        self.metadata.extend(metas)

        # Annoy forbids add_item() after build(), so rebuild from all
        # retained vectors to support repeated add() calls.
        self.index = AnnoyIndex(dim, 'angular')
        for i, vec in enumerate(self._vectors):
            self.index.add_item(i, vec)
        self.index.build(self.n_trees)

    def search(self, queries, k=3):
        """Return the top-*k* matches per query as {'score','text','meta'} dicts.

        Queries against an empty store return empty result lists.
        """
        if self.index is None:
            return [[] for _ in queries]
        q_vecs = model.encode(queries, convert_to_numpy=True)
        results = []
        for q in q_vecs:
            idxs, dists = self.index.get_nns_by_vector(q, k, include_distances=True)
            results.append([
                {
                    # angular distance d = sqrt(2 - 2*cos) => cos = 1 - d^2/2
                    "score": 1 - dists[i] ** 2 / 2,
                    "text": self.texts[idxs[i]],
                    "meta": self.metadata[idxs[i]]
                }
                for i in range(len(idxs))
            ])
        return results