| |
|
| | import argparse, json, os, glob
|
| | from pathlib import Path
|
| | import faiss
|
| | from sentence_transformers import SentenceTransformer
|
| | import numpy as np
|
| |
|
def gather_docs():
    """Collect (kind, path, text) triples from the knowledge-base tree.

    Scans JSON examples/snippets, GLM snippets, and a couple of fixed
    reference documents.  Unreadable files are skipped (best effort).

    Returns:
        list[tuple[str, str, str]]: (kind, file path, file contents).
    """
    items = []

    # JSON example and snippet files.
    for fp in glob.glob("kb/examples/*.json") + glob.glob("kb/snippets/json/*.json"):
        try:
            txt = Path(fp).read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # Best-effort: skip unreadable or mis-encoded files rather than
            # aborting the whole run.  (Was a bare `except:`, which also
            # swallowed KeyboardInterrupt/SystemExit.)
            continue
        items.append(("json", fp, txt))

    # GLM snippets: only .glm, .md, or extension-less entries are indexed.
    for fp in glob.glob("kb/snippets/glm/*"):
        try:
            if Path(fp).suffix.lower() in {".glm", ".md", ""}:
                items.append(("glm", fp, Path(fp).read_text(encoding="utf-8")))
        except (OSError, UnicodeDecodeError):
            # Same best-effort skip as above.
            continue

    # Fixed reference documents, included only when present.
    for fp in ["kb/cheatsheets/property_glossary.md", "kb/specs/point_semantics.json"]:
        if Path(fp).exists():
            items.append(("text", fp, Path(fp).read_text(encoding="utf-8")))
    return items
|
| |
|
def main():
    """Build a FAISS inner-product index over the knowledge base.

    Embeds every gathered document with a sentence-transformers model and
    writes the index plus a metadata JSON file into --outdir.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="sentence-transformers/all-MiniLM-L6-v2")
    parser.add_argument("--outdir", default="rag_index")
    args = parser.parse_args()

    os.makedirs(args.outdir, exist_ok=True)

    docs = gather_docs()
    if not docs:
        raise SystemExit("No docs to index.")

    # One text per document: a "[kind] path" header line, then the body.
    texts = []
    for kind, path, body in docs:
        texts.append(f"[{kind}] {path}\n{body}")

    encoder = SentenceTransformer(args.model)
    # Normalized embeddings + inner-product index => cosine similarity search.
    vectors = encoder.encode(
        texts, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True)

    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    faiss.write_index(index, str(Path(args.outdir, "kb.faiss")))

    meta = {"paths": [path for (_, path, _) in docs], "model": args.model}
    Path(args.outdir, "kb_meta.json").write_text(
        json.dumps(meta, indent=2), encoding="utf-8")
    print("[ok] indexed", len(docs), "docs to", Path(args.outdir, "kb.faiss").resolve())
|
| |
|
# Script entry point: build the index when run directly, not on import.
if __name__ == "__main__":
    main()
|
| |
|