Spaces:
Paused
Paused
"""Build the FAISS vector store used for retrieval over the malware corpus.

Reads one document per non-blank line from ``CORPUS_PATH``, embeds every
document with a SentenceTransformer model, stores the vectors in a flat L2
FAISS index, and writes two files into ``OUT_DIR``:

* ``index.faiss`` -- the serialized FAISS index
* ``meta.pkl``    -- the pickled list of document strings, in the same
  order in which their vectors were added to the index

Both paths are relative, so they are resolved against the current working
directory of the process running this script.
"""
import os
import pickle

import faiss
from sentence_transformers import SentenceTransformer

CORPUS_PATH = "../mcma/micro-cyber-llm/rag/corpus/malware_knowledge.txt"
OUT_DIR = "../mcma/micro-cyber-llm/rag/vectorstore"

os.makedirs(OUT_DIR, exist_ok=True)

print("[*] Loading embedding model...")
model = SentenceTransformer("all-MiniLM-L6-v2")

print("[*] Reading corpus...")
with open(CORPUS_PATH, "r", encoding="utf-8") as f:
    # One document per line; blank / whitespace-only lines are dropped.
    documents = [line.strip() for line in f if line.strip()]

# Without any documents there is nothing to embed, and reading the embedding
# dimension below would fail with an unhelpful error. Stop with a clear
# message instead.
if not documents:
    raise SystemExit(
        f"[!] No documents found in {CORPUS_PATH}; nothing to index."
    )

print(f"[*] Embedding {len(documents)} documents...")
embeddings = model.encode(documents, show_progress_bar=True)

# The index must be created with the vector width, which is taken from the
# second axis of the embedding matrix returned by the model.
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

print("[*] Saving FAISS index...")
faiss.write_index(index, os.path.join(OUT_DIR, "index.faiss"))

print("[*] Saving metadata...")
# NOTE: the metadata is stored with pickle; whatever loads meta.pkl should
# only do so for files it trusts, since unpickling can execute code.
with open(os.path.join(OUT_DIR, "meta.pkl"), "wb") as f:
    pickle.dump(documents, f)

print("[✓] RAG index built successfully!")