Rajan Sharma committed on
Commit
b1c2b18
·
verified ·
1 Parent(s): 37ae898

Create build_policy_index.py

Browse files
Files changed (1) hide show
  1. build_policy_index.py +58 -0
build_policy_index.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # build_policy_index.py
2
+ import os, glob, json
3
+ from pathlib import Path
4
+ from sentence_transformers import SentenceTransformer
5
+ import faiss
6
+ import numpy as np
7
+
8
# Directory that is scanned recursively for source documents.
POLICY_DIR = "policies"
# Directory that receives the generated artifacts (created if missing).
STORE_DIR = "rag_store"
# JSON file holding the model name and the per-chunk text/source records.
META_PATH = os.path.join(STORE_DIR, "meta.json")
# Serialized FAISS index holding one embedding per chunk.
INDEX_PATH = os.path.join(STORE_DIR, "index.faiss")
# Sentence-Transformers model used to embed the chunks.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
13
+
14
def read_text_like(path: str) -> str:
    """Return the contents of a plain-text file, or "" for any other file.

    Only paths ending in ``.txt`` or ``.md`` (compared case-insensitively)
    are read; this keeps the script free of extra parsing dependencies.
    Every other path yields an empty string without touching the disk.

    Args:
        path: Filesystem path of the candidate document.

    Returns:
        The file decoded as UTF-8, or an empty string for unsupported
        extensions.
    """
    is_plain_text = path.lower().endswith((".txt", ".md"))
    if not is_plain_text:
        return ""
    # errors="ignore" drops undecodable bytes instead of raising.
    return Path(path).read_text(encoding="utf-8", errors="ignore")
19
+
20
def chunk(text: str, size: int = 800, overlap: int = 100):
    """Yield overlapping, fixed-width character windows over ``text``.

    Consecutive windows start ``size - overlap`` characters apart, so each
    window repeats the last ``overlap`` characters of the one before it.
    The final window may be shorter than ``size``. Empty ``text`` yields
    nothing.

    Args:
        text: The string to split.
        size: Maximum number of characters per window; must be positive.
        overlap: Number of characters shared by neighbouring windows; must
            satisfy ``0 <= overlap < size``.

    Yields:
        Successive substrings of ``text``.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is outside
            ``[0, size)``. Because this is a generator, the error surfaces
            when iteration starts.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    # overlap >= size would make the step zero or negative and the loop
    # would never terminate; a negative overlap would silently skip text.
    if not 0 <= overlap < size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < size, got overlap={overlap}, size={size}"
        )
    step = size - overlap
    for start in range(0, len(text), step):
        yield text[start : start + size]
26
+
27
def main():
    """Embed every policy document and write the FAISS index plus metadata.

    Steps:
      1. Collect all regular files under ``POLICY_DIR`` (recursively, sorted
         so the output order is deterministic).
      2. Read each supported file and split it into overlapping chunks,
         remembering the relative source path of every chunk.
      3. Encode the chunks with ``MODEL_NAME`` using normalized embeddings
         and add them to an inner-product index.
      4. Write the index to ``INDEX_PATH`` and the chunk records to
         ``META_PATH``; record ``i`` in ``docs`` corresponds to row ``i`` of
         the index.

    Raises:
        SystemExit: If no file under ``POLICY_DIR`` produced any chunk.
    """
    os.makedirs(STORE_DIR, exist_ok=True)
    candidates = sorted(
        p
        for p in glob.glob(os.path.join(POLICY_DIR, "**", "*"), recursive=True)
        if os.path.isfile(p)
    )

    docs = []
    # Count only files that actually contributed chunks, so the summary line
    # does not include unsupported or blank files.
    indexed_files = 0
    for fp in candidates:
        txt = read_text_like(fp)
        if not txt.strip():
            continue
        indexed_files += 1
        source = os.path.relpath(fp)
        docs.extend({"text": ch, "source": source} for ch in chunk(txt))

    if not docs:
        raise SystemExit(f"No .txt/.md files found in '{POLICY_DIR}/'")

    model = SentenceTransformer(MODEL_NAME)
    texts = [d["text"] for d in docs]
    # With unit-length embeddings, inner product equals cosine similarity.
    embs = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)

    index = faiss.IndexFlatIP(embs.shape[1])
    # FAISS expects float32 input.
    index.add(embs.astype(np.float32))

    faiss.write_index(index, INDEX_PATH)
    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump({"model": MODEL_NAME, "docs": docs}, f, ensure_ascii=False)

    print(f"Indexed {len(docs)} chunks from {indexed_files} files → {INDEX_PATH}")


if __name__ == "__main__":
    main()