meet4150/insurence_model1 / rag /vector_store.py
meet4150's picture
download
raw
2.8 kB
import os
import re
from pathlib import Path
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.vectorstores import FAISS
from rag.chunker import chunk_documents
from rag.embedder import get_embedder
# Directory that contains this module; the default paths below are relative to it.
BASE_DIR = Path(__file__).resolve().parent
# Folder scanned for "*.txt" documents when the index is built.
DOCS_PATH = BASE_DIR / "documents"
# Location of the saved FAISS index; the FAISS_INDEX_PATH environment variable
# overrides the default "<module dir>/faiss_index".
INDEX_PATH = Path(os.getenv("FAISS_INDEX_PATH", str(BASE_DIR / "faiss_index")))
# Case-insensitive match for a heading of the form "SECTION <number>: <title>".
# NOTE(review): the \s* after the colon also matches newlines, so a heading with
# an empty title captures the next non-empty line as its title.
_SECTION_RE = re.compile(r"SECTION\s+\d+\s*:\s*[^\n]+", re.IGNORECASE)
# Process-wide cache of the FAISS store; None until an index is built or loaded.
_db_cache: FAISS | None = None
def _extract_section(text: str) -> str:
    """Return the first section heading found in *text*.

    The heading is whatever ``_SECTION_RE`` matches, with surrounding
    whitespace stripped. When nothing matches, the placeholder
    "Unknown section" is returned instead.
    """
    found = _SECTION_RE.search(text)
    if found is None:
        return "Unknown section"
    heading = found.group(0)
    return heading.strip()
def index_exists() -> bool:
    """Tell whether both saved index files are present under ``INDEX_PATH``.

    Returns True only if "index.faiss" and "index.pkl" both exist.
    """
    required_files = ("index.faiss", "index.pkl")
    return all((INDEX_PATH / filename).exists() for filename in required_files)
def build_index() -> FAISS:
    """Load all .txt policy docs, chunk, embed, save FAISS index.

    Before embedding, each chunk's metadata is rewritten:

    * ``source``   - reduced to the bare file name (directories dropped);
      "unknown" when the chunk carries no source.
    * ``section``  - first section heading found in the chunk text, or
      "Unknown section".
    * ``chunk_id`` - "<source>:<position>", where position is the chunk's
      index across the whole corpus (not per file).

    The resulting index is written to ``INDEX_PATH`` (created if missing)
    and stored in the module-level cache so later lookups reuse it.

    Returns:
        The freshly built FAISS store.

    Raises:
        ValueError: If no chunks were produced, e.g. ``DOCS_PATH`` holds no
            "*.txt" files. Nothing is written to disk and the cache is left
            untouched in that case.
    """
    global _db_cache

    loader = DirectoryLoader(str(DOCS_PATH), glob="*.txt", loader_cls=TextLoader)
    docs = loader.load()
    # Assumes chunk_documents returns a sized sequence of documents; len() is
    # applied to it below - TODO confirm against rag.chunker.
    chunks = chunk_documents(docs)

    if not chunks:
        # An empty corpus would otherwise fail deep inside the embedding/FAISS
        # call with an opaque low-level error; fail early with the real cause.
        raise ValueError(
            f"No content to index: {len(docs)} document(s) matching '*.txt' "
            f"in '{DOCS_PATH}' produced no chunks."
        )

    for idx, chunk in enumerate(chunks):
        source = os.path.basename(chunk.metadata.get("source", "unknown"))
        chunk.metadata["source"] = source
        chunk.metadata["section"] = _extract_section(chunk.page_content)
        chunk.metadata["chunk_id"] = f"{source}:{idx}"

    db = FAISS.from_documents(chunks, get_embedder())

    INDEX_PATH.mkdir(parents=True, exist_ok=True)
    db.save_local(str(INDEX_PATH))

    # Cache only after the index has been persisted successfully.
    _db_cache = db
    print(f"✅ Index built: {len(chunks)} chunks from {len(docs)} documents")
    return db
def load_index() -> FAISS:
    """Return the FAISS store, reading it from ``INDEX_PATH`` on first use.

    The loaded store is kept in the module-level cache, so disk is touched
    at most once per process unless the cache is replaced elsewhere.

    Raises:
        FileNotFoundError: If the cache is empty and the saved index files
            are not both present.
    """
    global _db_cache

    if _db_cache is None:
        if not index_exists():
            raise FileNotFoundError(
                f"FAISS index not found at '{INDEX_PATH}'. Run build_index() first."
            )
        index_dir = str(INDEX_PATH)
        embedder = get_embedder()
        # SECURITY NOTE: this flag permits pickle deserialization of the saved
        # index. Only point FAISS_INDEX_PATH at index files this application
        # produced itself; never at untrusted data.
        _db_cache = FAISS.load_local(
            index_dir,
            embedder,
            allow_dangerous_deserialization=True,
        )

    return _db_cache
def retrieve(query: str, k: int | None = None) -> list[dict]:
    """Search the index for *query* and describe the best-matching chunks.

    Args:
        query: Text to search for.
        k: Number of results to request. When omitted (or otherwise falsy),
            the TOP_K environment variable is used, defaulting to 4.

    Returns:
        One dict per hit, in the order the store returned them, with keys
        "content", "source" (bare file name), "section", "score",
        "distance" and "chunk_id". "score" is ``1 / (1 + distance)`` and
        both numbers are rounded to 4 decimal places.
    """
    if k:
        top_k = k
    else:
        top_k = int(os.getenv("TOP_K", 4))

    def _describe(doc, raw_distance) -> dict:
        # Convert once; the store may hand back a non-builtin float type.
        dist = float(raw_distance)
        meta = doc.metadata
        return {
            "content": doc.page_content,
            "source": os.path.basename(meta.get("source", "unknown")),
            "section": meta.get("section", "Unknown section"),
            "score": round(1 / (1 + dist), 4),
            "distance": round(dist, 4),
            "chunk_id": meta.get("chunk_id", "unknown"),
        }

    hits = load_index().similarity_search_with_score(query, k=top_k)
    return [_describe(doc, raw_distance) for doc, raw_distance in hits]

Xet Storage Details

Size:
2.8 kB
·
Xet hash:
8acefa75f8ff752aa3313e388e463e141500368108a49de918fdaa504da07e60

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.