from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import json
# Build a FAISS similarity index over the dua key texts:
# load the text -> id mapping, embed every key text with a MiniLM sentence
# encoder, L2-normalize the vectors so inner product equals cosine
# similarity, and persist the index to disk.
import os

# JSON is UTF-8 by spec; open in text mode explicitly.
with open("json/dua_keys.json", "r", encoding="utf-8") as fl:
    dua_data = json.load(fl)

# Dict keys are the searchable texts (values are presumably dua ids —
# confirm against json/dua_keys.json).
keys2 = list(dua_data)

model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    cache_folder="./all-MiniLM-L6-v2",
)
embeddings = model.encode(keys2)
dimension = embeddings.shape[1]

# Normalize rows so IndexFlatIP (inner-product) search ranks by cosine similarity.
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

# NOTE: PascalCase name kept because the usage snippet below refers to `Index`.
Index = faiss.IndexFlatIP(dimension)
Index.add(embeddings)  # type: ignore

# write_index raises if the target directory is missing — create it first.
os.makedirs("model", exist_ok=True)
faiss.write_index(Index, "./model/dua_vector.faiss")
"""
USAGE
def search(question,top_k=1):
question_embeddings = model.encode([question])
question_embeddings = question_embeddings / np.linalg.norm(question_embeddings,axis=1,keepdims=True)
distances,indices = Index.search(np.array(question_embeddings),top_k)
results = [(keys2[i], distances[0][pos]) for pos,i in enumerate(indices[0])]
return results
question = input("Question: ")
print(search(question))
"""
"""
# I used this to generate the new keys
import json
with open("json/dua_dataset.json",'rb') as fl:
dua_data = json.load(fl)
new_dataset = {}
for data in dua_data:
id = data["id"]
text = data["full_text"]
data = {text : id}
new_dataset.update(data)
with open("json/dua_keys.json",'w') as fl:
json.dump(new_dataset,fl)
"""
|