# RAG ingestion script: chunk policy documents, embed them, and build a FAISS index.
import glob
import json
import os
from pathlib import Path

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
# Input directory scanned (recursively) for .txt / .md policy documents.
POLICY_DIR = "policies"
# Output directory holding the serialized index and chunk metadata.
STORE_DIR = "rag_store"
# JSON file with per-chunk metadata (text + source path).
META_PATH = os.path.join(STORE_DIR, "meta.json")
# Serialized FAISS vector index.
INDEX_PATH = os.path.join(STORE_DIR, "index.faiss")
# Sentence-embedding model used to vectorize chunks.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
def read_text_like(path: str) -> str:
    """Return the UTF-8 text of *path* if it is a .txt/.md file, else ''.

    Undecodable bytes are ignored rather than raising, and any other
    extension yields an empty string so callers can skip the file.
    """
    is_text_doc = path.lower().endswith((".txt", ".md"))
    if not is_text_doc:
        return ""
    return Path(path).read_text(encoding="utf-8", errors="ignore")
def chunk(text: str, size: int = 800, overlap: int = 100):
    """Yield overlapping windows of *text*.

    Consecutive windows share ``overlap`` characters; each window is at
    most ``size`` characters long. An empty string yields nothing.

    Args:
        text: The string to split.
        size: Maximum window length (must be > 0).
        overlap: Characters shared between consecutive windows
            (must satisfy 0 <= overlap < size).

    Raises:
        ValueError: If the parameters would make the cursor fail to
            advance (``overlap >= size`` or ``size <= 0``), which in the
            naive loop causes an infinite loop.
    """
    if size <= 0 or overlap < 0 or overlap >= size:
        raise ValueError(
            f"need 0 <= overlap < size, got size={size}, overlap={overlap}"
        )
    step = size - overlap  # guaranteed >= 1 by the check above
    for start in range(0, len(text), step):
        yield text[start : start + size]
def main():
    """Build the FAISS index and metadata store from POLICY_DIR.

    Reads every .txt/.md file under POLICY_DIR, splits each into
    overlapping chunks, embeds them, and writes the index plus a JSON
    metadata file into STORE_DIR. Exits with an error if no usable
    documents are found.
    """
    os.makedirs(STORE_DIR, exist_ok=True)

    # Every regular file under POLICY_DIR, in deterministic order.
    pattern = os.path.join(POLICY_DIR, "**", "*")
    files = sorted(p for p in glob.glob(pattern, recursive=True) if os.path.isfile(p))

    records = []
    for path in files:
        content = read_text_like(path)
        if not content.strip():
            continue  # unsupported extension or effectively empty file
        rel = os.path.relpath(path)
        records.extend({"text": piece, "source": rel} for piece in chunk(content))

    if not records:
        raise SystemExit(f"No .txt/.md files found in '{POLICY_DIR}/'")

    # Normalized embeddings make inner product equivalent to cosine similarity.
    encoder = SentenceTransformer(MODEL_NAME)
    vectors = encoder.encode(
        [r["text"] for r in records],
        convert_to_numpy=True,
        normalize_embeddings=True,
    )

    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors.astype(np.float32))
    faiss.write_index(index, INDEX_PATH)

    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump({"model": MODEL_NAME, "docs": records}, f, ensure_ascii=False)

    print(f"Indexed {len(records)} chunks from {len(files)} files → {INDEX_PATH}")
# Script entry point: build the index only when executed directly, not on import.
if __name__ == "__main__":
    main()