Buckets:
| import os | |
| import re | |
| from pathlib import Path | |
| from langchain_community.document_loaders import DirectoryLoader, TextLoader | |
| from langchain_community.vectorstores import FAISS | |
| from rag.chunker import chunk_documents | |
| from rag.embedder import get_embedder | |
# Directory containing this module; the default data locations hang off it.
BASE_DIR = Path(__file__).resolve().parent

# Folder scanned by build_index() for the source documents.
DOCS_PATH = BASE_DIR / "documents"

# Where the FAISS index is saved and loaded. The FAISS_INDEX_PATH environment
# variable overrides the default "<module dir>/faiss_index".
INDEX_PATH = Path(os.environ.get("FAISS_INDEX_PATH", str(BASE_DIR / "faiss_index")))

# Matches a heading such as "SECTION 3: Leave Policy" up to the end of its
# line, ignoring case.
_SECTION_RE = re.compile(r"SECTION\s+\d+\s*:\s*[^\n]+", flags=re.IGNORECASE)

# Module-level cache of the index; set by build_index() and load_index() so
# the index is read from disk at most once.
_db_cache: FAISS | None = None
def _extract_section(text: str) -> str:
    """Return the first "SECTION <n>: <title>" heading found in *text*.

    The heading is located with the module-level ``_SECTION_RE`` pattern and
    returned with surrounding whitespace stripped. When *text* contains no
    such heading, the placeholder ``"Unknown section"`` is returned.
    """
    heading = _SECTION_RE.search(text)
    if heading is None:
        return "Unknown section"
    return heading.group(0).strip()
def index_exists() -> bool:
    """Report whether a saved index is present under ``INDEX_PATH``.

    Returns:
        ``True`` only when both ``index.faiss`` and ``index.pkl`` exist.
    """
    required_files = ("index.faiss", "index.pkl")
    return all((INDEX_PATH / name).exists() for name in required_files)
def build_index() -> FAISS:
    """Load all .txt policy docs, chunk, embed, and save the FAISS index.

    Every ``*.txt`` file directly inside ``DOCS_PATH`` is loaded and split
    with ``chunk_documents``. Each chunk is tagged with three metadata
    fields before embedding:

    * ``source``   - file name only (directory part removed)
    * ``section``  - heading found by ``_extract_section``
    * ``chunk_id`` - ``"<source>:<running index>"``

    The resulting store is written to ``INDEX_PATH`` and also placed in the
    module-level ``_db_cache``.

    Returns:
        The freshly built FAISS vector store.

    Raises:
        ValueError: If loading and chunking produced no chunks, e.g. because
            ``DOCS_PATH`` holds no ``.txt`` files.
    """
    global _db_cache

    loader = DirectoryLoader(str(DOCS_PATH), glob="*.txt", loader_cls=TextLoader)
    docs = loader.load()
    chunks = chunk_documents(docs)

    if not chunks:
        # An empty chunk list cannot yield a usable index; fail here with an
        # actionable message rather than deep inside the vector-store code.
        raise ValueError(
            f"No content to index: no .txt documents with text found in '{DOCS_PATH}'."
        )

    for idx, chunk in enumerate(chunks):
        source = os.path.basename(chunk.metadata.get("source", "unknown"))
        chunk.metadata["source"] = source
        chunk.metadata["section"] = _extract_section(chunk.page_content)
        # idx runs across all documents, so ids stay unique within the index.
        chunk.metadata["chunk_id"] = f"{source}:{idx}"

    db = FAISS.from_documents(chunks, get_embedder())

    INDEX_PATH.mkdir(parents=True, exist_ok=True)
    db.save_local(str(INDEX_PATH))
    _db_cache = db

    print(f"✅ Index built: {len(chunks)} chunks from {len(docs)} documents")
    return db
def load_index() -> FAISS:
    """Return the FAISS index, reading it from ``INDEX_PATH`` on first use.

    The loaded store is kept in the module-level ``_db_cache``; later calls
    return that same object without touching the disk again.

    Returns:
        The cached or newly loaded FAISS vector store.

    Raises:
        FileNotFoundError: If the saved index files are missing.
    """
    global _db_cache

    if _db_cache is None:
        if not index_exists():
            raise FileNotFoundError(
                f"FAISS index not found at '{INDEX_PATH}'. Run build_index() first."
            )
        # NOTE(review): allow_dangerous_deserialization=True opts in to
        # loading the pickle-backed index.pkl; INDEX_PATH (overridable via
        # environment variable) should only ever point at trusted files.
        _db_cache = FAISS.load_local(
            str(INDEX_PATH),
            get_embedder(),
            allow_dangerous_deserialization=True,
        )
    return _db_cache
def retrieve(query: str, k: int | None = None) -> list[dict]:
    """Search the index for *query* and describe the best-matching chunks.

    Args:
        query: Text to search for.
        k: Number of chunks to return. When omitted (or otherwise falsy),
            the ``TOP_K`` environment variable is used, defaulting to 4.

    Returns:
        One dict per hit, in the order returned by the index, with keys
        ``content``, ``source``, ``section``, ``score``, ``distance`` and
        ``chunk_id``. ``score`` is ``1 / (1 + distance)`` rounded to four
        places, so for non-negative distances a closer match scores nearer
        to 1.
    """
    if not k:
        k = int(os.getenv("TOP_K", 4))

    hits = load_index().similarity_search_with_score(query, k=k)

    return [
        {
            "content": doc.page_content,
            "source": os.path.basename(doc.metadata.get("source", "unknown")),
            "section": doc.metadata.get("section", "Unknown section"),
            "score": round(1 / (1 + float(distance)), 4),
            "distance": round(float(distance), 4),
            "chunk_id": doc.metadata.get("chunk_id", "unknown"),
        }
        for doc, distance in hits
    ]
Xet Storage Details
- Size:
- 2.8 kB
- Xet hash:
- 8acefa75f8ff752aa3313e388e463e141500368108a49de918fdaa504da07e60
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.