# CppSensei / embeddings.py
# CUDA28's picture
# Initial clean deploy of CppSensei
# 556c1d7
# raw
# history blame contribute delete
# 864 Bytes
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os
# Instantiate the sentence-embedding model that is used further down to
# turn each document into a vector.
print("Loading embedding model...")
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")
# Load training data: every non-blank line of the corpus file becomes one
# document, with surrounding whitespace removed.
data_path = "data/data.txt"
with open(data_path, "r", encoding="utf-8") as f:
    # Iterate the file object directly (no intermediate readlines() list) and
    # strip each line exactly once; lines that are empty after stripping are
    # dropped.
    documents = [text for text in map(str.strip, f) if text]
print(f"Loaded {len(documents)} documents")
if not documents:
    # Fail fast with an explicit message: the index-building step reads the
    # embedding dimension from the encoded array, which is undefined when
    # there is nothing to encode.
    raise ValueError(f"No documents found in {data_path!r}; nothing to embed")
# Create embeddings for every entry in `documents`.
print("Creating embeddings...")
embeddings = model.encode(documents)
# Create FAISS index
# FAISS works on C-contiguous float32 matrices. np.ascontiguousarray only
# copies when the encoder output is not already in that layout, whereas
# np.array() would duplicate the whole matrix unconditionally and would not
# enforce the dtype.
vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
# The index dimensionality is the width of one embedding row.
dimension = vectors.shape[1]
# Flat (exhaustive-search) index using L2 distance.
index = faiss.IndexFlatL2(dimension)
index.add(vectors)
# Persist the FAISS index to "vector.index" and write the texts it was built
# from to "documents.txt", one document per line in list order.
faiss.write_index(index, "vector.index")
with open("documents.txt", "w", encoding="utf-8") as f:
    f.writelines(doc + "\n" for doc in documents)
print("Vector database created successfully!")