# ds6b-attackplan-qlora / scripts / build_rag_index.py
# Uploaded via huggingface_hub (revision fba140f, user adetuire1).
# -*- coding: utf-8 -*-
import argparse, json, os, glob
from pathlib import Path
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np
def gather_docs():
    """Collect knowledge-base documents to embed.

    Scans the local ``kb/`` tree for JSON examples/snippets, GLM snippets
    (``.glm``, ``.md``, or extension-less files), and two fixed reference
    files (property glossary, point-semantics spec).

    Returns:
        list[tuple[str, str, str]]: ``(kind, path, text)`` triples, where
        *kind* is ``"json"``, ``"glm"``, or ``"text"``.
    """
    items = []
    # JSON examples/snippets
    for fp in glob.glob("kb/examples/*.json") + glob.glob("kb/snippets/json/*.json"):
        try:
            items.append(("json", fp, Path(fp).read_text(encoding="utf-8")))
        except (OSError, UnicodeDecodeError):
            # Best-effort: skip unreadable or badly-encoded files.
            pass
    # GLM snippets as text
    for fp in glob.glob("kb/snippets/glm/*"):
        path = Path(fp)
        if path.suffix.lower() not in {".glm", ".md", ""}:
            continue
        try:
            items.append(("glm", fp, path.read_text(encoding="utf-8")))
        except (OSError, UnicodeDecodeError):
            pass
    # Cheatsheet + semantics spec, included only if present
    for fp in ["kb/cheatsheets/property_glossary.md", "kb/specs/point_semantics.json"]:
        if Path(fp).exists():
            items.append(("text", fp, Path(fp).read_text(encoding="utf-8")))
    return items
def main():
    """Embed the knowledge base and write a FAISS index plus metadata.

    Command-line flags:
        --model   sentence-transformers model name to embed with.
        --outdir  directory receiving ``kb.faiss`` and ``kb_meta.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="sentence-transformers/all-MiniLM-L6-v2")  # small + fast
    parser.add_argument("--outdir", default="rag_index")
    opts = parser.parse_args()

    os.makedirs(opts.outdir, exist_ok=True)

    docs = gather_docs()
    if not docs:
        raise SystemExit("No docs to index.")

    encoder = SentenceTransformer(opts.model)
    # Prefix each chunk with its kind tag and path so retrieval keeps provenance.
    texts = []
    for k, p, t in docs:
        texts.append(f"[{k}] {p}\n{t}")
    vectors = encoder.encode(texts, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True)

    # Normalized embeddings + inner product == cosine similarity.
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    faiss.write_index(index, str(Path(opts.outdir, "kb.faiss")))

    meta = {"paths": [p for (_, p, _) in docs], "model": opts.model}
    Path(opts.outdir, "kb_meta.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
    print("[ok] indexed", len(docs), "docs to", Path(opts.outdir, "kb.faiss").resolve())
# Script entry point: build the index when run directly (not on import).
if __name__ == "__main__":
    main()