"""Build the FAISS vector store for the malware-knowledge RAG corpus.

Reads CORPUS_PATH line by line (one document per non-blank line), embeds the
documents with the ``all-MiniLM-L6-v2`` sentence-transformer model, and writes
two artifacts into OUT_DIR:

* ``index.faiss`` -- a flat L2 FAISS index holding the embeddings
* ``meta.pkl``    -- the pickled list of document strings that was embedded

The script exits with an error message if the corpus contains no non-blank
lines.
"""

import os
import pickle

import faiss
from sentence_transformers import SentenceTransformer

# NOTE: both paths are relative, so they resolve against the current working
# directory of the process, not against the location of this file.
CORPUS_PATH = "../mcma/micro-cyber-llm/rag/corpus/malware_knowledge.txt"
OUT_DIR = "../mcma/micro-cyber-llm/rag/vectorstore"

os.makedirs(OUT_DIR, exist_ok=True)

print("[*] Loading embedding model...")
model = SentenceTransformer("all-MiniLM-L6-v2")

print("[*] Reading corpus...")
with open(CORPUS_PATH, "r", encoding="utf-8") as f:
    # One document per line; surrounding whitespace is stripped and lines
    # that are empty after stripping are dropped.
    documents = [stripped for line in f if (stripped := line.strip())]

# Fail early with a clear message: with nothing to embed there is no
# embedding dimension to read below, and an empty index would be useless.
if not documents:
    raise SystemExit(f"[!] No documents found in corpus: {CORPUS_PATH}")

print(f"[*] Embedding {len(documents)} documents...")
embeddings = model.encode(documents, show_progress_bar=True)

# The index dimensionality is taken from the embedding matrix itself
# (second axis) rather than hard-coded for the model.
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

print("[*] Saving FAISS index...")
faiss.write_index(index, os.path.join(OUT_DIR, "index.faiss"))

print("[*] Saving metadata...")
# The document list is stored alongside the index so search hits can be
# mapped back to text. NOTE(review): this is a pickle file -- consumers
# should only unpickle it when it comes from a trusted build.
with open(os.path.join(OUT_DIR, "meta.pkl"), "wb") as f:
    pickle.dump(documents, f)

print("[✓] RAG index built successfully!")