R-help-chat / test_retriever.py
jedick
Rebuild db for R-devel
8e00599
from retriever import BuildRetriever
from main import ProcessCollection
from dotenv import load_dotenv
# Setup environment variables
load_dotenv(dotenv_path=".env", override=True)
# Define email and database directories
# NOTE: Here we add the R-devel collection to the database
# (R-help was already added by the CI running test_main.py before this file)
email_dir = "test_emails/R-devel/"
db_dir = "test_db"
def test_retriever():
# Create the test database
ProcessCollection(email_dir, db_dir)
# Get a dense retriever instance
retriever = BuildRetriever(
db_dir, "R-help", "dense", top_k=1, start_year=2025, end_year=2025
)
# The result is a semantically similar match to the query
results = retriever.invoke("inscrutable")
assert (
"anyone who might know enough to actually do it" in results[0].page_content
or "makes no sense" in results[0].page_content
)
# But we don't get an exact match
assert not "inscrutable" in results[0].page_content
# Try keyword retrieval
retriever = BuildRetriever(
db_dir, "R-help", "sparse", top_k=1, start_year=2025, end_year=2025
)
results = retriever.invoke("inscrutable")
# This time we get an exact match
assert "inscrutable" in results[0].page_content
# R-devel with hybrid search
retriever = BuildRetriever(
db_dir, "R-devel", "hybrid", top_k=2, start_year=2025, end_year=2025
)
results = retriever.invoke("MCMC")
assert "MCMC" in results[0].page_content
# Search by month - sparse
retriever = BuildRetriever(
db_dir,
"R-help",
"sparse",
top_k=6,
start_year=2025,
end_year=2025,
months=["Dec"],
)
results = retriever.invoke("the")
# Check that the source file name for each result contains "December"
assert all(["December" in result.metadata["source"] for result in results])
# Search by month - dense
retriever = BuildRetriever(
db_dir,
"R-help",
"dense",
top_k=6,
start_year=2025,
end_year=2025,
months=["Oct"],
)
results = retriever.invoke("plotting")
assert all(["October" in result.metadata["source"] for result in results])
# In the test database, only one email in October 2025 has the word "plot"
assert "plot" in results[0].page_content