File size: 648 Bytes
3381d43
 
 
 
 
 
936810c
3381d43
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from Chunk.chunkingData import chunkData, loadData
def create_vector_database(
    model_name: str = 'intfloat/multilingual-e5-large',
    output_dir: str = "VectorDatabase",
):
    """Embed every document chunk and persist the results to disk.

    Loads documents with ``loadData()``, splits them with ``chunkData()``,
    encodes each chunk's ``page_content`` with a SentenceTransformer model,
    and writes two files into ``output_dir``:

    * ``embeddings.npy`` -- whatever ``model.encode`` returned for the texts.
    * ``texts.npy`` -- the chunk texts, in the same order as the embeddings.

    Args:
        model_name: Name or path passed to ``SentenceTransformer``.
        output_dir: Directory that receives the ``.npy`` files. It is
            created if it does not exist yet.

    Returns:
        A ``(model, texts, doc_embedding)`` tuple: the loaded model, the list
        of chunk texts, and the embeddings produced for them.
    """
    # Local import keeps this block self-contained; the file does not
    # import ``os`` at module level.
    import os

    # Create the target directory before the expensive model load/encode so
    # a missing or unwritable path fails early instead of after encoding.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    model = SentenceTransformer(model_name)

    documents = loadData()
    chunks = chunkData(documents)

    texts = [chunk.page_content for chunk in chunks]
    # NOTE(review): the E5 model card recommends prefixing indexed texts with
    # "passage: " (and queries with "query: "). No prefix is added here so the
    # stored embeddings stay unchanged -- confirm against the query-side code.
    doc_embedding = model.encode(texts)

    np.save(os.path.join(output_dir, "embeddings.npy"), doc_embedding)
    # np.save converts the list of strings to an array before writing.
    np.save(os.path.join(output_dir, "texts.npy"), texts)
    return model, texts, doc_embedding

# Script entry point: build and persist the vector database only when this
# file is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    create_vector_database()