Rajan Sharma committed
Create build_policy_index.py
build_policy_index.py  +58 -0
build_policy_index.py ADDED
@@ -0,0 +1,58 @@
# build_policy_index.py
import os, glob, json
from pathlib import Path
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

POLICY_DIR = "policies"
STORE_DIR = "rag_store"
META_PATH = os.path.join(STORE_DIR, "meta.json")
INDEX_PATH = os.path.join(STORE_DIR, "index.faiss")
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

def read_text_like(path: str) -> str:
    # Keep it simple: .txt / .md only to avoid extra deps
    if path.lower().endswith((".txt", ".md")):
        return Path(path).read_text(encoding="utf-8", errors="ignore")
    return ""

def chunk(text: str, size=800, overlap=100):
    i = 0
    n = len(text)
    while i < n:
        yield text[i : i + size]
        i += size - overlap

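# Note: with the defaults size=800 and overlap=100 the window advances by
# 700 characters per step, so consecutive chunks share 100 characters:
#   text[0:800], text[700:1500], text[1400:2200], ...
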
def main():
    os.makedirs(STORE_DIR, exist_ok=True)
    files = sorted(
        [p for p in glob.glob(os.path.join(POLICY_DIR, "**", "*"), recursive=True)
         if os.path.isfile(p)]
    )
    docs = []
    for fp in files:
        txt = read_text_like(fp)
        if not txt.strip():
            continue
        for ch in chunk(txt):
            docs.append({"text": ch, "source": os.path.relpath(fp)})

    if not docs:
        raise SystemExit(f"No .txt/.md files found in '{POLICY_DIR}/'")

    model = SentenceTransformer(MODEL_NAME)
    texts = [d["text"] for d in docs]
    embs = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)

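    # IndexFlatIP scores by inner product; with normalize_embeddings=True
    # above, inner product is equivalent to cosine similarity.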
    index = faiss.IndexFlatIP(embs.shape[1])
    index.add(embs.astype(np.float32))

    faiss.write_index(index, INDEX_PATH)
    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump({"model": MODEL_NAME, "docs": docs}, f, ensure_ascii=False)

    print(f"Indexed {len(docs)} chunks from {len(files)} files → {INDEX_PATH}")

if __name__ == "__main__":
    main()
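
For reference, a minimal retrieval sketch against the store this script writes. It is not part of the commit: query_policies and its top_k parameter are illustrative names, and it assumes the meta.json / index.faiss layout and the embedding model recorded above. Dependencies install with something like pip install sentence-transformers faiss-cpu numpy (faiss-gpu also works if available).

# query_policy_index.py (illustrative companion, not part of this commit)
import json
import faiss
from sentence_transformers import SentenceTransformer

def query_policies(question: str, top_k: int = 3):
    index = faiss.read_index("rag_store/index.faiss")
    with open("rag_store/meta.json", encoding="utf-8") as f:
        meta = json.load(f)
    # Reuse the same model that built the index, as recorded in meta.json
    model = SentenceTransformer(meta["model"])
    # Normalized query vector, so inner-product search returns cosine scores
    q = model.encode([question], convert_to_numpy=True, normalize_embeddings=True)
    scores, ids = index.search(q.astype("float32"), top_k)
    return [
        {"score": float(s), **meta["docs"][i]}
        for s, i in zip(scores[0], ids[0])
        if i != -1  # FAISS pads missing results with -1
    ]

if __name__ == "__main__":
    # Placeholder query; swap in a real question about your policy docs
    for hit in query_policies("What does the travel policy say about per diems?"):
        print(f"{hit['score']:.3f}  {hit['source']}")

Scores from IndexFlatIP over normalized embeddings fall in [-1, 1], with higher meaning more similar.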