mcma_malware / rag /build_index.py
zeltera's picture
Upload 28 files
cbbe164 verified
raw
history blame contribute delete
957 Bytes
import os
import faiss
import pickle
from sentence_transformers import SentenceTransformer
# Build a FAISS vector index over a line-oriented text corpus for RAG lookup.
#
# Inputs : CORPUS_PATH - UTF-8 text file, one document per non-blank line.
# Outputs: OUT_DIR/index.faiss - FAISS index holding one vector per document.
#          OUT_DIR/meta.pkl    - pickled list of the document strings, in the
#                                same order the vectors were added to the index.
#
# Both paths are relative, so they resolve against the current working
# directory of the process that runs this script.
CORPUS_PATH = "../mcma/micro-cyber-llm/rag/corpus/malware_knowledge.txt"
OUT_DIR = "../mcma/micro-cyber-llm/rag/vectorstore"

os.makedirs(OUT_DIR, exist_ok=True)

print("[*] Loading embedding model...")
model = SentenceTransformer("all-MiniLM-L6-v2")

print("[*] Reading corpus...")
# Each non-blank line becomes one document; surrounding whitespace is dropped.
with open(CORPUS_PATH, "r", encoding="utf-8") as f:
    documents = [line.strip() for line in f if line.strip()]

if not documents:
    # An empty corpus produces no embeddings, so the dimension lookup below
    # could not succeed; fail here with a message that names the real cause.
    raise ValueError(
        f"Corpus {CORPUS_PATH!r} contains no non-blank lines; nothing to index."
    )

print(f"[*] Embedding {len(documents)} documents...")
embeddings = model.encode(documents, show_progress_bar=True)

# The index width is taken from the embeddings themselves rather than being
# hard-coded, so swapping the model name above needs no other change.
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
# Vectors are added in a single call in document order, so position i in the
# index corresponds to documents[i] in the metadata file.
index.add(embeddings)

print("[*] Saving FAISS index...")
faiss.write_index(index, os.path.join(OUT_DIR, "index.faiss"))

print("[*] Saving metadata...")
# NOTE(review): consumers must only unpickle meta.pkl from a trusted location;
# unpickling a tampered file can execute arbitrary code.
with open(os.path.join(OUT_DIR, "meta.pkl"), "wb") as f:
    pickle.dump(documents, f)

print("[✓] RAG index built successfully!")