AgentBench / tools /vector_store.py
Adityax-07's picture
feat: add core LangGraph multi-agent pipeline
be745f3
Raw
History Blame Contribute Delete
5.19 kB
"""
FAISS Vector Store Tool for RAG Applications
--------------------------------------------
This file provides a complete pipeline to:
1. Convert raw documents → embeddings
2. Build and persist a FAISS vector database
3. Load existing vector database
4. Retrieve semantically relevant chunks for LLM context
Designed for LangChain / AI Agent workflows.
"""
# =========================
# Imports
# =========================
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from typing import List, Union
import os
# =========================
# Global Embedding Model
# =========================
# WHY HuggingFace instead of OpenAI?
# Runs locally — no API key, no cost, no network call for embeddings.
# all-MiniLM-L6-v2 is fast, small, and accurate for semantic search.
# Single module-level embedding model instance; build_store() and load_store()
# both pass this same object to FAISS.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# =========================
# Text Splitter (CRITICAL)
# =========================
# WHY:
# LLMs and embeddings work best with small semantic chunks.
# Chunk overlap preserves context between chunks.
# Shared splitter used by build_store(): chunk_size caps each chunk at 800,
# and chunk_overlap repeats 150 between neighbouring chunks so that context
# at a chunk boundary is not lost.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=150, chunk_size=800)
# =========================
# Build Vector Store
# =========================
def build_store(
    docs: List[Union[str, Document]],
    save_path: str = "faiss_store"
) -> FAISS:
    """
    Build a FAISS vector store from documents and save it locally.

    Parameters
    ----------
    docs : list[str | Document]
        Raw text strings, LangChain Document objects, or a mix of both.
        Strings are wrapped in Document objects; every other element is
        passed through unchanged.
    save_path : str
        Directory where the FAISS index will be stored.

    Returns
    -------
    FAISS
        The vector store that was just built and saved.

    Raises
    ------
    ValueError
        If ``docs`` is empty, or if splitting the documents yields no
        text chunks (so there is nothing to embed).
    """
    # ---- Guardrail ----
    # Fail loudly instead of building an index from nothing.
    if not docs:
        raise ValueError("Document list is empty.")

    # ---- Convert strings -> Document objects ----
    # Decide per element rather than by looking at docs[0] only: the
    # signature allows strings and Documents in the same list.
    documents = [
        Document(page_content=item) if isinstance(item, str) else item
        for item in docs
    ]

    # ---- Split documents into chunks ----
    split_docs = text_splitter.split_documents(documents)
    print(f"Created {len(split_docs)} text chunks.")

    # Nothing to embed (e.g. only empty strings were passed): raise a clear
    # error here instead of an opaque failure inside the index construction.
    if not split_docs:
        raise ValueError("Documents produced no text chunks.")

    # ---- Create FAISS vector store ----
    # Embeds every chunk with the shared model and builds the index.
    vector_store = FAISS.from_documents(split_docs, embeddings)

    # ---- Persist to disk ----
    # Saved so that later runs can load_store() instead of re-embedding.
    vector_store.save_local(save_path)
    print(f"Vector store saved at '{save_path}'")
    return vector_store
# =========================
# Load Existing Store
# =========================
def load_store(path: str = "faiss_store") -> Union[FAISS, None]:
    """
    Load a previously saved FAISS vector store.

    Parameters
    ----------
    path : str
        Directory that build_store() saved the index into.

    Returns
    -------
    FAISS or None
        The loaded vector store, or None when no usable store exists at
        ``path`` (missing directory, or a directory without the saved
        index files). Callers are expected to handle None, e.g. by
        running without RAG.
    """
    # save_local() with the default index name writes "index.faiss" and
    # "index.pkl". Checking for those files (not just that the path exists)
    # keeps an empty directory or a stray file on the "no store" path
    # instead of crashing inside FAISS.load_local().
    index_files = (
        os.path.join(path, "index.faiss"),
        os.path.join(path, "index.pkl"),
    )
    if not all(os.path.isfile(f) for f in index_files):
        print(f"[VectorStore] No store found at '{path}' — running without RAG.")
        return None

    # NOTE(security): allow_dangerous_deserialization=True makes load_local()
    # unpickle index.pkl, which can execute arbitrary code. Only point this
    # at stores written by this application, never at untrusted files.
    return FAISS.load_local(
        path,
        embeddings,
        allow_dangerous_deserialization=True
    )
# =========================
# Retrieval Function
# =========================
def retrieve(query: str, store: Union[FAISS, None], k: int = 4) -> str:
    """
    Retrieve the top-k relevant chunks for a query.

    Parameters
    ----------
    query : str
        User question.
    store : FAISS or None
        Loaded vector store. None (what load_store() returns when nothing
        has been built yet) is treated as "no documents available".
    k : int
        Number of chunks to retrieve.

    Returns
    -------
    str
        The retrieved chunks, each prefixed with "[Source i]" and separated
        by blank lines, ready to be used as LLM context. Returns
        "Empty query provided." for a blank query and
        "No relevant documents found." when there is no store or the
        search returns nothing.
    """
    # ---- Guardrails ----
    if not query or not query.strip():
        return "Empty query provided."
    # load_store() may hand back None; answer with the same message as an
    # empty search result instead of raising AttributeError.
    if store is None:
        return "No relevant documents found."

    # ---- Semantic search ----
    docs = store.similarity_search(query, k=k)
    if not docs:
        return "No relevant documents found."

    # ---- Format for LLM ----
    # Number each chunk so it can be told apart in the prompt.
    return "\n\n".join(
        f"[Source {i}]\n{doc.page_content}"
        for i, doc in enumerate(docs, 1)
    )
# =========================
# Example Usage (CLI demo)
# =========================
# Run this file directly to test the pipeline.
if __name__ == "__main__":
    # Small end-to-end demo: build the index, reload it from disk, then
    # run one query against the reloaded copy.
    demo_corpus = [
        "LangChain is a framework for building LLM powered apps.",
        "FAISS is a vector database developed by Facebook AI.",
        "Embeddings convert text into numerical vectors.",
        "RAG stands for Retrieval Augmented Generation.",
    ]
    demo_query = "What is FAISS?"

    print("\n--- Building Vector Store ---")
    # The returned store is not kept; the demo reloads it from disk below.
    build_store(demo_corpus)

    print("\n--- Loading Vector Store ---")
    reloaded_store = load_store()

    print("\n--- Retrieval Demo ---")
    answer_context = retrieve(demo_query, reloaded_store)

    print("\nRetrieved Context:\n")
    print(answer_context)