Buckets:

meet4150
/

insurence_model1

Files

xet

meet4150/insurence_model1 / rag /vector_store.py

meet4150

about 1 month ago

download

raw

2.8 kB

	import os
	import re
	from pathlib import Path

	from langchain_community.document_loaders import DirectoryLoader, TextLoader
	from langchain_community.vectorstores import FAISS

	from rag.chunker import chunk_documents
	from rag.embedder import get_embedder

	# Directory that contains this module; the default paths below hang off it.
	BASE_DIR = Path(__file__).resolve().parent
	# Folder whose *.txt files are loaded when the index is built.
	DOCS_PATH = BASE_DIR / "documents"
	# Location of the persisted FAISS index; override with FAISS_INDEX_PATH.
	INDEX_PATH = Path(os.getenv("FAISS_INDEX_PATH", str(BASE_DIR / "faiss_index")))
	# Matches a heading line such as "SECTION 3: Coverage" (case-insensitive).
	# Whitespace around the colon is optional so that both "SECTION 3: X" and
	# "SECTION 3 : X" are recognised; the match runs to the end of that line.
	_SECTION_RE = re.compile(r"SECTION\s+\d+\s*:\s*[^\n]+", re.IGNORECASE)
	# Process-wide cache of the index, populated by build_index()/load_index().
	_db_cache: FAISS | None = None


	def _extract_section(text: str) -> str:
	    """Return the first section heading found in *text*.

	    The module-level ``_SECTION_RE`` pattern is searched against *text*.
	    On a hit the matched heading is returned with surrounding whitespace
	    stripped; otherwise the placeholder ``"Unknown section"`` is returned.
	    """
	    found = _SECTION_RE.search(text)
	    if found is None:
	        return "Unknown section"
	    return found.group(0).strip()


	def index_exists() -> bool:
	    """Tell whether both persisted index files exist under ``INDEX_PATH``.

	    Returns ``True`` only when ``index.faiss`` and ``index.pkl`` are both
	    present; checking stops at the first missing file.
	    """
	    required_files = ("index.faiss", "index.pkl")
	    return all((INDEX_PATH / filename).exists() for filename in required_files)


	def build_index() -> FAISS:
	    """Load all .txt policy docs, chunk, embed, save FAISS index.

	    Every ``*.txt`` file in ``DOCS_PATH`` is loaded and split with
	    ``chunk_documents``. Each chunk's metadata is then normalised:

	    * ``source``   - reduced to the bare file name,
	    * ``section``  - heading detected in the chunk text, or the placeholder
	      returned by ``_extract_section``,
	    * ``chunk_id`` - ``"<source>:<position>"`` where the position is the
	      chunk's index in the overall chunk list.

	    The resulting index is written to ``INDEX_PATH`` (created if missing)
	    and stored in the module-level cache so later ``load_index()`` calls
	    reuse it.

	    Returns:
	        The freshly built FAISS vector store.

	    Raises:
	        ValueError: If no chunks were produced (for example, the documents
	            folder holds no ``*.txt`` files), since an index cannot be
	            built from nothing.
	    """
	    global _db_cache

	    loader = DirectoryLoader(str(DOCS_PATH), glob="*.txt", loader_cls=TextLoader)
	    docs = loader.load()
	    chunks = chunk_documents(docs)

	    # Fail early with an actionable message instead of letting the vector
	    # store constructor choke on an empty input list.
	    if not chunks:
	        raise ValueError(
	            f"No content to index: found {len(docs)} document(s) in "
	            f"'{DOCS_PATH}' but produced no chunks."
	        )

	    for idx, chunk in enumerate(chunks):
	        # Keep only the file name so stored metadata does not leak local paths.
	        source = os.path.basename(chunk.metadata.get("source", "unknown"))
	        chunk.metadata["source"] = source
	        chunk.metadata["section"] = _extract_section(chunk.page_content)
	        chunk.metadata["chunk_id"] = f"{source}:{idx}"

	    db = FAISS.from_documents(chunks, get_embedder())
	    INDEX_PATH.mkdir(parents=True, exist_ok=True)
	    db.save_local(str(INDEX_PATH))
	    _db_cache = db
	    print(f"✅ Index built: {len(chunks)} chunks from {len(docs)} documents")
	    return db


	def load_index() -> FAISS:
	    """Return the FAISS index, reading it from disk on first use.

	    The loaded store is kept in the module-level ``_db_cache`` so that the
	    files under ``INDEX_PATH`` are read at most once per process.

	    Raises:
	        FileNotFoundError: If the cache is empty and the index files are
	            not present on disk.
	    """
	    global _db_cache

	    if _db_cache is None:
	        if not index_exists():
	            raise FileNotFoundError(
	                f"FAISS index not found at '{INDEX_PATH}'. Run build_index() first."
	            )
	        # NOTE(review): this flag allows unpickling of index.pkl, so
	        # INDEX_PATH should only ever point at an index this application
	        # wrote itself.
	        _db_cache = FAISS.load_local(
	            str(INDEX_PATH),
	            get_embedder(),
	            allow_dangerous_deserialization=True,
	        )

	    return _db_cache


	def retrieve(query: str, k: int | None = None) -> list[dict]:
	    """Return top-k chunks with similarity scores and source attribution.

	    Args:
	        query: Text to search the index for.
	        k: Number of hits to request. A falsy value (``None`` or ``0``)
	            falls back to the ``TOP_K`` environment variable, or 4 when
	            that is unset.

	    Returns:
	        One dict per hit, in the order the vector store returned them, with
	        the keys ``content``, ``source``, ``section``, ``score``,
	        ``distance`` and ``chunk_id``. ``score`` is ``1 / (1 + distance)``
	        rounded to four decimals; ``distance`` is the raw value reported by
	        the store, also rounded to four decimals.
	    """
	    top_k = k or int(os.getenv("TOP_K", 4))
	    hits = load_index().similarity_search_with_score(query, k=top_k)

	    # Convert each raw distance to a plain float once, then shape the record.
	    return [
	        {
	            "content": hit.page_content,
	            "source": os.path.basename(hit.metadata.get("source", "unknown")),
	            "section": hit.metadata.get("section", "Unknown section"),
	            "score": round(1 / (1 + dist), 4),
	            "distance": round(dist, 4),
	            "chunk_id": hit.metadata.get("chunk_id", "unknown"),
	        }
	        for hit, dist in ((found, float(raw)) for found, raw in hits)
	    ]

Xet Storage Details

Size: 2.8 kB
Xet hash: 8acefa75f8ff752aa3313e388e463e141500368108a49de918fdaa504da07e60

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.