Spaces:
Running
Running
File size: 2,390 Bytes
6020ae0 8e00599 6020ae0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | from retriever import BuildRetriever
from main import ProcessCollection
from dotenv import load_dotenv
# Setup environment variables
load_dotenv(dotenv_path=".env", override=True)
# Define email and database directories
# NOTE: Here we add the R-devel collection to the database
# (R-help was already added by the CI running test_main.py before this file)
email_dir = "test_emails/R-devel/"
db_dir = "test_db"
def test_retriever():
# Create the test database
ProcessCollection(email_dir, db_dir)
# Get a dense retriever instance
retriever = BuildRetriever(
db_dir, "R-help", "dense", top_k=1, start_year=2025, end_year=2025
)
# The result is a semantically similar match to the query
results = retriever.invoke("inscrutable")
assert (
"anyone who might know enough to actually do it" in results[0].page_content
or "makes no sense" in results[0].page_content
)
# But we don't get an exact match
assert not "inscrutable" in results[0].page_content
# Try keyword retrieval
retriever = BuildRetriever(
db_dir, "R-help", "sparse", top_k=1, start_year=2025, end_year=2025
)
results = retriever.invoke("inscrutable")
# This time we get an exact match
assert "inscrutable" in results[0].page_content
# R-devel with hybrid search
retriever = BuildRetriever(
db_dir, "R-devel", "hybrid", top_k=2, start_year=2025, end_year=2025
)
results = retriever.invoke("MCMC")
assert "MCMC" in results[0].page_content
# Search by month - sparse
retriever = BuildRetriever(
db_dir,
"R-help",
"sparse",
top_k=6,
start_year=2025,
end_year=2025,
months=["Dec"],
)
results = retriever.invoke("the")
# Check that the source file name for each result contains "December"
assert all(["December" in result.metadata["source"] for result in results])
# Search by month - dense
retriever = BuildRetriever(
db_dir,
"R-help",
"dense",
top_k=6,
start_year=2025,
end_year=2025,
months=["Oct"],
)
results = retriever.invoke("plotting")
assert all(["October" in result.metadata["source"] for result in results])
# In the test database, only one email in October 2025 has the word "plot"
assert "plot" in results[0].page_content
|