# -*- coding: utf-8 -*-
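"""Build a FAISS index over the local knowledge base for RAG retrieval.

Gathers JSON examples, GLM snippets, and reference docs from kb/, embeds
them with a sentence-transformers model, and writes kb.faiss plus a
kb_meta.json sidecar mapping index rows back to source paths.
"""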
import argparse, json, os, glob
from pathlib import Path
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np
def gather_docs():
    """Collect (kind, path, text) triples from the knowledge base."""
    items = []
    # JSON examples/snippets
    for fp in glob.glob("kb/examples/*.json") + glob.glob("kb/snippets/json/*.json"):
        try:
            txt = Path(fp).read_text(encoding="utf-8")
            items.append(("json", fp, txt))
        except (OSError, UnicodeDecodeError):
            pass  # skip unreadable files
    # GLM snippets as text
    for fp in glob.glob("kb/snippets/glm/*"):
        if Path(fp).suffix.lower() not in {".glm", ".md", ""}:
            continue
        try:
            txt = Path(fp).read_text(encoding="utf-8")
            items.append(("glm", fp, txt))
        except (OSError, UnicodeDecodeError):
            pass  # skip unreadable files
    # cheatsheet + semantics
    for fp in ["kb/cheatsheets/property_glossary.md", "kb/specs/point_semantics.json"]:
        if Path(fp).exists():
            items.append(("text", fp, Path(fp).read_text(encoding="utf-8")))
    return items
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="sentence-transformers/all-MiniLM-L6-v2")  # small + fast
    ap.add_argument("--outdir", default="rag_index")
    args = ap.parse_args()
    os.makedirs(args.outdir, exist_ok=True)

    docs = gather_docs()
    if not docs:
        raise SystemExit("No docs to index.")

    model = SentenceTransformer(args.model)
    # Prefix each chunk with its kind and path so retrieval hits stay traceable.
    corpus = [f"[{k}] {p}\n{t}" for (k, p, t) in docs]
    emb = model.encode(corpus, normalize_embeddings=True, convert_to_numpy=True,
                       show_progress_bar=True)
    emb = np.asarray(emb, dtype="float32")  # FAISS expects float32

    # With normalized embeddings, inner product == cosine similarity.
    dim = emb.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(emb)

    faiss.write_index(index, str(Path(args.outdir, "kb.faiss")))
    Path(args.outdir, "kb_meta.json").write_text(
        json.dumps({"paths": [p for (_, p, _) in docs], "model": args.model}, indent=2),
        encoding="utf-8",
    )
    print("[ok] indexed", len(docs), "docs to", Path(args.outdir, "kb.faiss").resolve())


if __name__ == "__main__":
    main()
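# Retrieval counterpart, left here as a hedged sketch rather than part of the
# script: load the kb.faiss and kb_meta.json produced above, embed a query with
# the same model, and take the top-k inner-product (i.e. cosine) neighbors.
# The query string is illustrative only.
#
#   import json, faiss
#   from pathlib import Path
#   from sentence_transformers import SentenceTransformer
#
#   meta = json.loads(Path("rag_index/kb_meta.json").read_text(encoding="utf-8"))
#   index = faiss.read_index("rag_index/kb.faiss")
#   model = SentenceTransformer(meta["model"])
#   q = model.encode(["example query"], normalize_embeddings=True, convert_to_numpy=True)
#   scores, ids = index.search(q, 5)  # top-5 hits
#   for i, s in zip(ids[0], scores[0]):
#       print(f"{s:.3f}  {meta['paths'][i]}")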