# -*- coding: utf-8 -*-
"""Build a FAISS similarity index over the local knowledge base.

Collects JSON examples/snippets, GLM snippets, and a couple of reference
documents from ``kb/``, embeds them with a SentenceTransformer model, and
writes a flat inner-product index (``kb.faiss``) plus a metadata sidecar
(``kb_meta.json``) to ``--outdir``.
"""
import argparse
import glob
import json
import os
from pathlib import Path

# Per-file errors tolerated while gathering: unreadable files (OSError) and
# files that are not valid UTF-8 (UnicodeDecodeError). Anything else is a
# real bug and should surface instead of being swallowed.
_READ_ERRORS = (OSError, UnicodeDecodeError)


def gather_docs():
    """Collect ``(kind, path, text)`` triples for every indexable KB document.

    Returns:
        list[tuple[str, str, str]]: triples where ``kind`` is one of
        ``"json"``, ``"glm"`` or ``"text"``. Glob-discovered files that
        cannot be read or decoded are skipped (best-effort); the two fixed
        reference documents are read unconditionally when present.
    """
    items = []
    # JSON examples/snippets
    for fp in glob.glob("kb/examples/*.json") + glob.glob("kb/snippets/json/*.json"):
        try:
            txt = Path(fp).read_text(encoding="utf-8")
        except _READ_ERRORS:
            continue  # best-effort: skip unreadable/undecodable files
        items.append(("json", fp, txt))
    # GLM snippets as text: ".glm", ".md", or extensionless files only
    for fp in glob.glob("kb/snippets/glm/*"):
        if Path(fp).suffix.lower() not in {".glm", ".md", ""}:
            continue
        try:
            txt = Path(fp).read_text(encoding="utf-8")
        except _READ_ERRORS:
            continue
        items.append(("glm", fp, txt))
    # cheatsheet + semantics (fixed paths; read errors here should surface)
    for fp in ["kb/cheatsheets/property_glossary.md", "kb/specs/point_semantics.json"]:
        if Path(fp).exists():
            items.append(("text", fp, Path(fp).read_text(encoding="utf-8")))
    return items


def main():
    """CLI entry point: embed the KB docs and write the FAISS index."""
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--model",
        default="sentence-transformers/all-MiniLM-L6-v2",  # small + fast
        help="SentenceTransformer model name to embed with",
    )
    ap.add_argument(
        "--outdir",
        default="rag_index",
        help="output directory for kb.faiss and kb_meta.json",
    )
    args = ap.parse_args()

    os.makedirs(args.outdir, exist_ok=True)
    docs = gather_docs()
    if not docs:
        raise SystemExit("No docs to index.")

    # Heavy third-party imports are deferred so importing this module
    # (e.g. to reuse gather_docs) does not require faiss/torch installed.
    import faiss
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer(args.model)
    # Prefix each document with its kind and path so retrieval hits can be cited.
    corpus = [f"[{k}] {p}\n{t}" for (k, p, t) in docs]
    # normalize_embeddings=True + IndexFlatIP => inner product == cosine similarity.
    emb = model.encode(
        corpus,
        normalize_embeddings=True,
        convert_to_numpy=True,
        show_progress_bar=True,
    )

    index = faiss.IndexFlatIP(emb.shape[1])
    index.add(emb)
    faiss.write_index(index, str(Path(args.outdir, "kb.faiss")))

    # Metadata row order matches the index's row order: row i -> paths[i].
    Path(args.outdir, "kb_meta.json").write_text(
        json.dumps({"paths": [p for (_, p, _) in docs], "model": args.model}, indent=2),
        encoding="utf-8",
    )
    print("[ok] indexed", len(docs), "docs to", Path(args.outdir, "kb.faiss").resolve())


if __name__ == "__main__":
    main()