# RAG ingestion script: chunk policy documents, embed them, and build a FAISS index.
import glob
import json
import os
from pathlib import Path

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
# Input directory scanned (recursively) for .txt / .md policy documents.
POLICY_DIR = "policies"
# Output directory holding the serialized index and chunk metadata.
STORE_DIR = "rag_store"
# JSON file with per-chunk metadata (text + source path).
META_PATH = os.path.join(STORE_DIR, "meta.json")
# Serialized FAISS vector index.
INDEX_PATH = os.path.join(STORE_DIR, "index.faiss")
# Sentence-embedding model used to vectorize chunks.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
def read_text_like(path: str) -> str:
    """Return the UTF-8 text of *path* if it is a .txt/.md file, else ''.

    Undecodable bytes are ignored rather than raising, and any other
    extension yields an empty string so callers can skip the file.
    """
    is_text_doc = path.lower().endswith((".txt", ".md"))
    if not is_text_doc:
        return ""
    return Path(path).read_text(encoding="utf-8", errors="ignore")
def chunk(text: str, size: int = 800, overlap: int = 100):
    """Yield overlapping windows of *text*.

    Consecutive windows share ``overlap`` characters; each window is at
    most ``size`` characters long. An empty string yields nothing.

    Args:
        text: The string to split.
        size: Maximum window length (must be > 0).
        overlap: Characters shared between consecutive windows
            (must satisfy 0 <= overlap < size).

    Raises:
        ValueError: If the parameters would make the cursor fail to
            advance (``overlap >= size`` or ``size <= 0``), which in the
            naive loop causes an infinite loop.
    """
    if size <= 0 or overlap < 0 or overlap >= size:
        raise ValueError(
            f"need 0 <= overlap < size, got size={size}, overlap={overlap}"
        )
    step = size - overlap  # guaranteed >= 1 by the check above
    for start in range(0, len(text), step):
        yield text[start : start + size]
def main():
    """Build the FAISS index and metadata store from POLICY_DIR.

    Reads every .txt/.md file under POLICY_DIR, splits each into
    overlapping chunks, embeds them, and writes the index plus a JSON
    metadata file into STORE_DIR. Exits with an error if no usable
    documents are found.
    """
    os.makedirs(STORE_DIR, exist_ok=True)

    # Every regular file under POLICY_DIR, in deterministic order.
    pattern = os.path.join(POLICY_DIR, "**", "*")
    files = sorted(p for p in glob.glob(pattern, recursive=True) if os.path.isfile(p))

    records = []
    for path in files:
        content = read_text_like(path)
        if not content.strip():
            continue  # unsupported extension or effectively empty file
        rel = os.path.relpath(path)
        records.extend({"text": piece, "source": rel} for piece in chunk(content))

    if not records:
        raise SystemExit(f"No .txt/.md files found in '{POLICY_DIR}/'")

    # Normalized embeddings make inner product equivalent to cosine similarity.
    encoder = SentenceTransformer(MODEL_NAME)
    vectors = encoder.encode(
        [r["text"] for r in records],
        convert_to_numpy=True,
        normalize_embeddings=True,
    )

    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors.astype(np.float32))
    faiss.write_index(index, INDEX_PATH)

    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump({"model": MODEL_NAME, "docs": records}, f, ensure_ascii=False)

    print(f"Indexed {len(records)} chunks from {len(files)} files → {INDEX_PATH}")
# Script entry point: build the index only when executed directly, not on import.
if __name__ == "__main__":
    main()