File size: 1,434 Bytes
6181bfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import json


# Build a FAISS similarity index over the dua question keys.
#
# Reads json/dua_keys.json (a mapping of question text -> dua id), embeds
# every key with a MiniLM sentence encoder, L2-normalizes the vectors so
# that an inner-product index performs cosine-similarity search, and
# persists the index to ./model/dua_vector.faiss.
#
# NOTE(review): the original had a no-op `global dua_data` here — at module
# scope every assignment is already global, so it was removed.
with open("json/dua_keys.json", "r", encoding="utf-8") as fl:
    dua_data = json.load(fl)

# The dict keys are the searchable question texts; values are dua ids.
keys2 = list(dua_data)

model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    cache_folder="./all-MiniLM-L6-v2",
)
embeddings = model.encode(keys2)

dimension = embeddings.shape[1]

# L2-normalize each row so inner product == cosine similarity; queries must
# be normalized the same way before searching (see USAGE below).
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
Index = faiss.IndexFlatIP(dimension)
Index.add(embeddings)  # type: ignore

faiss.write_index(Index, "./model/dua_vector.faiss")

"""
USAGE
def search(question,top_k=1):
    question_embeddings = model.encode([question])
    question_embeddings = question_embeddings / np.linalg.norm(question_embeddings,axis=1,keepdims=True)
    distances,indices = Index.search(np.array(question_embeddings),top_k)
    results = [(keys2[i], distances[0][pos]) for pos,i in enumerate(indices[0])]
    return results

question = input("Question: ")
print(search(question))
"""



"""

# I used this to generate the new keys
import json


with open("json/dua_dataset.json",'rb') as fl:
    dua_data = json.load(fl)

    new_dataset = {}

    for data in dua_data:
        id = data["id"]
        text = data["full_text"]
        data = {text : id}
        new_dataset.update(data)
        
    with open("json/dua_keys.json",'w') as fl:
         json.dump(new_dataset,fl)

"""