"""Tests for the retrievers returned by ``BuildRetriever`` on the test database."""

from dotenv import load_dotenv

from retriever import BuildRetriever
from main import ProcessCollection

# Setup environment variables
load_dotenv(dotenv_path=".env", override=True)

# Define email and database directories
# NOTE: Here we add the R-devel collection to the database
# (R-help was already added by the CI running test_main.py before this file)
email_dir = "test_emails/R-devel/"
db_dir = "test_db"


def test_retriever():
    """Exercise dense, sparse and hybrid retrieval plus month filtering.

    The R-devel emails are first processed into the test database, then
    retrievers are built for the R-help and R-devel collections (restricted
    to 2025) and their top results are checked against expected content.
    """
    # Create the test database
    ProcessCollection(email_dir, db_dir)

    # Get a dense retriever instance
    retriever = BuildRetriever(
        db_dir, "R-help", "dense", top_k=1, start_year=2025, end_year=2025
    )
    # The result is a semantically similar match to the query
    results = retriever.invoke("inscrutable")
    assert (
        "anyone who might know enough to actually do it" in results[0].page_content
        or "makes no sense" in results[0].page_content
    )
    # But we don't get an exact match
    assert "inscrutable" not in results[0].page_content

    # Try keyword retrieval
    retriever = BuildRetriever(
        db_dir, "R-help", "sparse", top_k=1, start_year=2025, end_year=2025
    )
    results = retriever.invoke("inscrutable")
    # This time we get an exact match
    assert "inscrutable" in results[0].page_content

    # R-devel with hybrid search
    retriever = BuildRetriever(
        db_dir, "R-devel", "hybrid", top_k=2, start_year=2025, end_year=2025
    )
    results = retriever.invoke("MCMC")
    assert "MCMC" in results[0].page_content

    # Search by month - sparse
    retriever = BuildRetriever(
        db_dir,
        "R-help",
        "sparse",
        top_k=6,
        start_year=2025,
        end_year=2025,
        months=["Dec"],
    )
    results = retriever.invoke("the")
    # all() is True for an empty iterable, so make sure the month filter
    # actually returned something before checking each result.
    assert results, "month-filtered sparse search returned no results"
    # Check that the source file name for each result contains "December"
    assert all("December" in result.metadata["source"] for result in results)

    # Search by month - dense
    retriever = BuildRetriever(
        db_dir,
        "R-help",
        "dense",
        top_k=6,
        start_year=2025,
        end_year=2025,
        months=["Oct"],
    )
    results = retriever.invoke("plotting")
    assert all("October" in result.metadata["source"] for result in results)
    # In the test database, only one email in October 2025 has the word "plot"
    assert "plot" in results[0].page_content