jedick committed on
Commit
e1365aa
·
1 Parent(s): cf58ea1

Support collections and filtering by months

Browse files
Files changed (8) hide show
  1. app.py +63 -34
  2. data.py +1 -3
  3. graph.py +14 -10
  4. index.py +23 -11
  5. main.py +27 -13
  6. prompts.py +30 -12
  7. retriever.py +134 -46
  8. util.py +17 -10
app.py CHANGED
@@ -13,13 +13,15 @@ from util import get_sources, get_start_end_months
13
  from data import download_data, extract_data
14
  from main import openai_model
15
  from graph import BuildGraph
16
- from retriever import db_dir
17
 
18
  # Set environment variables
19
  load_dotenv(dotenv_path=".env", override=True)
20
  # Hide BM25S progress bars
21
  os.environ["DISABLE_TQDM"] = "true"
22
 
 
 
 
23
  # Download and extract data if data directory is not present
24
  if not os.path.isdir(db_dir):
25
  print("Downloading data ... ", end="")
@@ -32,7 +34,8 @@ if not os.path.isdir(db_dir):
32
  # Global setting for search type
33
  search_type = "hybrid"
34
 
35
- # Global variables for LangChain graph: use dictionaries to store user-specific instances
 
36
  # https://www.gradio.app/guides/state-in-blocks
37
  graph_instances = {}
38
 
@@ -86,7 +89,7 @@ def append_content(chunk_messages, history, thinking_about):
86
  return history
87
 
88
 
89
- def run_workflow(input, history, thread_id, session_hash):
90
  """The main function to run the chat workflow"""
91
 
92
  # Get graph instance
@@ -97,6 +100,8 @@ def run_workflow(input, history, thread_id, session_hash):
97
  chat_model = ChatOpenAI(model=openai_model, temperature=0)
98
  graph_builder = BuildGraph(
99
  chat_model,
 
 
100
  search_type,
101
  )
102
  # Compile the graph with an in-memory checkpointer
@@ -106,7 +111,7 @@ def run_workflow(input, history, thread_id, session_hash):
106
  graph_instances[session_hash] = graph
107
  # ISO 8601 timestamp with local timezone information without microsecond
108
  timestamp = datetime.now().replace(microsecond=0).isoformat()
109
- print(f"{timestamp} - Set graph for session {session_hash}")
110
  ## Notify when model finishes loading
111
  # gr.Success("Model loaded!", duration=4)
112
  else:
@@ -148,7 +153,7 @@ def run_workflow(input, history, thread_id, session_hash):
148
  if start_year or end_year:
149
  content = f"{content} ({start_year or ''} - {end_year or ''})"
150
  if "months" in args:
151
- content = f"{content} {args['months']}"
152
  history.append(
153
  gr.ChatMessage(
154
  role="assistant",
@@ -169,12 +174,12 @@ def run_workflow(input, history, thread_id, session_hash):
169
  email_list = message.content.replace(
170
  "### Retrieved Emails:\n\n", ""
171
  ).split("--- --- --- --- Next Email --- --- --- ---\n\n")
172
- # Get the list of source files (e.g. R-help/2024-December.txt) for retrieved emails
173
- month_list = [text.splitlines()[0] for text in email_list]
 
 
174
  # Format months (e.g. 2024-December) into text
175
- month_text = (
176
- ", ".join(month_list).replace("R-help/", "").replace(".txt", "")
177
- )
178
  # Get the number of retrieved emails
179
  n_emails = len(email_list)
180
  title = f"🗎 Retrieved {n_emails} emails"
@@ -219,7 +224,7 @@ def run_workflow(input, history, thread_id, session_hash):
219
  yield history, None, citations
220
 
221
 
222
- def to_workflow(request: gr.Request, *args):
223
  """Wrapper function to call run_workflow() with session_hash"""
224
  input = args[0]
225
  # Add session_hash to arguments
@@ -318,19 +323,20 @@ with gr.Blocks(
318
  <!-- Get AI-powered answers about R programming backed by email retrieval. -->
319
  ## 🇷🤝💬 R-help-chat
320
 
321
- **Search and chat with the [R-help mailing list archives](https://stat.ethz.ch/pipermail/r-help/).**
 
322
  An LLM turns your question into a search query, including year ranges and months.
323
  Retrieved emails are shown below the chatbot and are used by the LLM to generate an answer.
324
- You can ask follow-up questions with the chat history as context.
325
  Press the clear button (🗑) to clear the history and start a new chat.
326
  *Privacy notice*: Data sharing with OpenAI is enabled.
327
  """
328
  return intro
329
 
330
- def get_info_text():
331
  try:
332
  # Get source files for each email and start and end months from database
333
- sources = get_sources()
334
  start, end = get_start_end_months(sources)
335
  except:
336
  # If database isn't ready, put in empty values
@@ -339,28 +345,26 @@ with gr.Blocks(
339
  end = None
340
  info_text = f"""
341
  **Database:** {len(sources)} emails from {start} to {end}<br>
342
- **Models:** {openai_model} and text-embedding-3-small<br>
343
  **Features:** RAG, today's date, hybrid search (semantic + lexical), multiple retrievals, citations output, chat memory<br>
344
  **Tech:** [OpenAI](https://openai.com/), [Chroma](https://www.trychroma.com/),
345
  [BM25S](https://github.com/xhluca/bm25s), [LangGraph](https://www.langchain.com/langgraph), [Gradio](https://www.langchain.com/langgraph)<br>
 
346
  🏠 **More info:** [R-help-chat GitHub repository](https://github.com/jedick/R-help-chat)
347
  """
348
  return info_text
349
 
350
- def get_example_questions(as_dataset=True):
351
  """Get example questions"""
352
  questions = [
353
  # "What is today's date?",
354
- "Summarize emails from the most recent two months",
355
  "Show me code examples using plotmath",
356
- "When was has.HLC mentioned?",
357
- "Who reported installation problems in 2023-2024?",
358
  ]
359
 
360
  # cf. https://github.com/gradio-app/gradio/pull/8745 for updating examples
361
  return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
362
 
363
- def get_multi_tool_questions(as_dataset=True):
364
  """Get multi-tool example questions"""
365
  questions = [
366
  "Differences between lapply and for loops",
@@ -369,7 +373,7 @@ with gr.Blocks(
369
 
370
  return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
371
 
372
- def get_multi_turn_questions(as_dataset=True):
373
  """Get multi-turn example questions"""
374
  questions = [
375
  "Lookup emails that reference bugs.r-project.org in 2025",
@@ -378,6 +382,15 @@ with gr.Blocks(
378
 
379
  return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
380
 
 
 
 
 
 
 
 
 
 
381
  with gr.Row():
382
  # Left column: Intro, Compute, Chat
383
  with gr.Column(scale=2):
@@ -385,10 +398,10 @@ with gr.Blocks(
385
  with gr.Column(scale=4):
386
  intro = gr.Markdown(get_intro_text())
387
  with gr.Column(scale=1):
388
- gr.Radio(
389
- ["Auto", "R-help", "R-devel", "R-pkg-devel"],
390
- label="🚧 Mailing List (Under construction) 🚧",
391
- interactive=False,
392
  )
393
  with gr.Group() as chat_interface:
394
  chatbot.render()
@@ -401,23 +414,28 @@ with gr.Blocks(
401
  # Right column: Info, Examples
402
  with gr.Column(scale=1):
403
  with gr.Accordion("ℹ️ App Info", open=True):
404
- info = gr.Markdown(get_info_text())
405
  with gr.Accordion("💡 Examples", open=True):
406
  # Add some helpful examples
407
  example_questions = gr.Examples(
408
- examples=get_example_questions(as_dataset=False),
409
  inputs=[input],
410
- label="Click an example to fill the message box",
411
  )
412
  multi_tool_questions = gr.Examples(
413
- examples=get_multi_tool_questions(as_dataset=False),
414
  inputs=[input],
415
  label="Multiple retrievals",
416
  )
417
  multi_turn_questions = gr.Examples(
418
- examples=get_multi_turn_questions(as_dataset=False),
 
 
 
 
 
419
  inputs=[input],
420
- label="Asking follow-up questions",
421
  )
422
 
423
  # Bottom row: retrieved emails and citations
@@ -458,10 +476,21 @@ with gr.Blocks(
458
  # https://github.com/gradio-app/gradio/issues/9722
459
  chatbot.clear(generate_thread_id, outputs=[thread_id], api_visibility="private")
460
 
 
 
 
 
 
 
 
 
 
 
 
461
  input.submit(
462
  # Submit input to the chatbot
463
- to_workflow,
464
- [input, chatbot, thread_id],
465
  [chatbot, retrieved_emails, citations_text],
466
  api_visibility="private",
467
  )
 
13
  from data import download_data, extract_data
14
  from main import openai_model
15
  from graph import BuildGraph
 
16
 
17
  # Set environment variables
18
  load_dotenv(dotenv_path=".env", override=True)
19
  # Hide BM25S progress bars
20
  os.environ["DISABLE_TQDM"] = "true"
21
 
22
+ # Database directory
23
+ db_dir = "db"
24
+
25
  # Download and extract data if data directory is not present
26
  if not os.path.isdir(db_dir):
27
  print("Downloading data ... ", end="")
 
34
  # Global setting for search type
35
  search_type = "hybrid"
36
 
37
+ # Global variable for LangChain graph
38
+ # Use dictionary to store user-specific instances
39
  # https://www.gradio.app/guides/state-in-blocks
40
  graph_instances = {}
41
 
 
89
  return history
90
 
91
 
92
+ def run_workflow(input, collection, history, thread_id, session_hash):
93
  """The main function to run the chat workflow"""
94
 
95
  # Get graph instance
 
100
  chat_model = ChatOpenAI(model=openai_model, temperature=0)
101
  graph_builder = BuildGraph(
102
  chat_model,
103
+ db_dir,
104
+ collection,
105
  search_type,
106
  )
107
  # Compile the graph with an in-memory checkpointer
 
111
  graph_instances[session_hash] = graph
112
  # ISO 8601 timestamp with local timezone information without microsecond
113
  timestamp = datetime.now().replace(microsecond=0).isoformat()
114
+ print(f"{timestamp} - Set {collection} graph for session {session_hash}")
115
  ## Notify when model finishes loading
116
  # gr.Success("Model loaded!", duration=4)
117
  else:
 
153
  if start_year or end_year:
154
  content = f"{content} ({start_year or ''} - {end_year or ''})"
155
  if "months" in args:
156
+ content = f"{content} {", ".join(args['months'])}"
157
  history.append(
158
  gr.ChatMessage(
159
  role="assistant",
 
174
  email_list = message.content.replace(
175
  "### Retrieved Emails:\n\n", ""
176
  ).split("--- --- --- --- Next Email --- --- --- ---\n\n")
177
+ # Get the source file names (e.g. 2024-December.txt) for retrieved emails
178
+ month_list = [
179
+ os.path.basename(text.splitlines()[0]) for text in email_list
180
+ ]
181
  # Format months (e.g. 2024-December) into text
182
+ month_text = ", ".join(month_list).replace(".txt", "")
 
 
183
  # Get the number of retrieved emails
184
  n_emails = len(email_list)
185
  title = f"🗎 Retrieved {n_emails} emails"
 
224
  yield history, None, citations
225
 
226
 
227
+ def run_workflow_in_session(request: gr.Request, *args):
228
  """Wrapper function to call run_workflow() with session_hash"""
229
  input = args[0]
230
  # Add session_hash to arguments
 
323
  <!-- Get AI-powered answers about R programming backed by email retrieval. -->
324
  ## 🇷🤝💬 R-help-chat
325
 
326
+ **Search and chat with the [R-help](https://stat.ethz.ch/pipermail/r-help/) and [R-devel](https://stat.ethz.ch/pipermail/r-devel/)
327
+ mailing list archives.**
328
  An LLM turns your question into a search query, including year ranges and months.
329
  Retrieved emails are shown below the chatbot and are used by the LLM to generate an answer.
330
+ You can ask follow-up questions with the chat history as context; changing the mailing list maintains history.
331
  Press the clear button (🗑) to clear the history and start a new chat.
332
  *Privacy notice*: Data sharing with OpenAI is enabled.
333
  """
334
  return intro
335
 
336
+ def get_info_text(collection):
337
  try:
338
  # Get source files for each email and start and end months from database
339
+ sources = get_sources(db_dir, collection)
340
  start, end = get_start_end_months(sources)
341
  except:
342
  # If database isn't ready, put in empty values
 
345
  end = None
346
  info_text = f"""
347
  **Database:** {len(sources)} emails from {start} to {end}<br>
 
348
  **Features:** RAG, today's date, hybrid search (semantic + lexical), multiple retrievals, citations output, chat memory<br>
349
  **Tech:** [OpenAI](https://openai.com/), [Chroma](https://www.trychroma.com/),
350
  [BM25S](https://github.com/xhluca/bm25s), [LangGraph](https://www.langchain.com/langgraph), [Gradio](https://www.langchain.com/langgraph)<br>
351
+ **Maintainer:** [Jeffrey Dick](mailto:j3ffdick@gmail.com) - feedback welcome!<br>
352
  🏠 **More info:** [R-help-chat GitHub repository](https://github.com/jedick/R-help-chat)
353
  """
354
  return info_text
355
 
356
+ def get_example_questions(as_dataset=False):
357
  """Get example questions"""
358
  questions = [
359
  # "What is today's date?",
 
360
  "Show me code examples using plotmath",
361
+ "Summarize emails from the most recent two months",
 
362
  ]
363
 
364
  # cf. https://github.com/gradio-app/gradio/pull/8745 for updating examples
365
  return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
366
 
367
+ def get_multi_tool_questions(as_dataset=False):
368
  """Get multi-tool example questions"""
369
  questions = [
370
  "Differences between lapply and for loops",
 
373
 
374
  return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
375
 
376
+ def get_multi_turn_questions(as_dataset=False):
377
  """Get multi-turn example questions"""
378
  questions = [
379
  "Lookup emails that reference bugs.r-project.org in 2025",
 
382
 
383
  return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
384
 
385
+ def get_month_questions(as_dataset=False):
386
+ """Get month example questions"""
387
+ questions = [
388
+ "Was there any discussion of ggplot2 in Q4 2025?",
389
+ "How about Q3?",
390
+ ]
391
+
392
+ return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
393
+
394
  with gr.Row():
395
  # Left column: Intro, Compute, Chat
396
  with gr.Column(scale=2):
 
398
  with gr.Column(scale=4):
399
  intro = gr.Markdown(get_intro_text())
400
  with gr.Column(scale=1):
401
+ collection = gr.Radio(
402
+ ["R-help", "R-devel"],
403
+ value="R-help",
404
+ label="Mailing List",
405
  )
406
  with gr.Group() as chat_interface:
407
  chatbot.render()
 
414
  # Right column: Info, Examples
415
  with gr.Column(scale=1):
416
  with gr.Accordion("ℹ️ App Info", open=True):
417
+ app_info = gr.Markdown(get_info_text(collection.value))
418
  with gr.Accordion("💡 Examples", open=True):
419
  # Add some helpful examples
420
  example_questions = gr.Examples(
421
+ examples=get_example_questions(),
422
  inputs=[input],
423
+ label="Basic examples",
424
  )
425
  multi_tool_questions = gr.Examples(
426
+ examples=get_multi_tool_questions(),
427
  inputs=[input],
428
  label="Multiple retrievals",
429
  )
430
  multi_turn_questions = gr.Examples(
431
+ examples=get_multi_turn_questions(),
432
+ inputs=[input],
433
+ label="Follow-up questions",
434
+ )
435
+ month_questions = gr.Examples(
436
+ examples=get_month_questions(),
437
  inputs=[input],
438
+ label="Three-month periods",
439
  )
440
 
441
  # Bottom row: retrieved emails and citations
 
476
  # https://github.com/gradio-app/gradio/issues/9722
477
  chatbot.clear(generate_thread_id, outputs=[thread_id], api_visibility="private")
478
 
479
+ collection.change(
480
+ # We need to build a new graph if the collection changes
481
+ cleanup_graph
482
+ ).then(
483
+ # Update the database stats in the app info box
484
+ get_info_text,
485
+ [collection],
486
+ [app_info],
487
+ api_name=False,
488
+ )
489
+
490
  input.submit(
491
  # Submit input to the chatbot
492
+ run_workflow_in_session,
493
+ [input, collection, chatbot, thread_id],
494
  [chatbot, retrieved_emails, citations_text],
495
  api_visibility="private",
496
  )
data.py CHANGED
@@ -45,9 +45,7 @@ def download_data():
45
 
46
  if not os.path.exists("db.zip"):
47
  # For S3 (need AWS_ACCESS_KEY_ID and AWS_ACCESS_KEY_SECRET)
48
- # db_20250801.zip: chromadb==1.0.13
49
- # db_20250801a.zip: chromadb==0.6.3
50
- download_file_from_bucket("r-help-chat", "db_20260102.zip", "db.zip")
51
  ## For Dropbox (shared file - key is in URL)
52
  # shared_link = "https://www.dropbox.com/scl/fi/jx90g5lorpgkkyyzeurtc/db.zip?rlkey=wvqa3p9hdy4rmod1r8yf2am09&st=l9tsam56&dl=0"
53
  # output_filename = "db.zip"
 
45
 
46
  if not os.path.exists("db.zip"):
47
  # For S3 (need AWS_ACCESS_KEY_ID and AWS_ACCESS_KEY_SECRET)
48
+ download_file_from_bucket("r-help-chat", "db_20260104.zip", "db.zip")
 
 
49
  ## For Dropbox (shared file - key is in URL)
50
  # shared_link = "https://www.dropbox.com/scl/fi/jx90g5lorpgkkyyzeurtc/db.zip?rlkey=wvqa3p9hdy4rmod1r8yf2am09&st=l9tsam56&dl=0"
51
  # output_filename = "db.zip"
graph.py CHANGED
@@ -17,7 +17,9 @@ from prompts import query_prompt, answer_prompt
17
 
18
  def BuildGraph(
19
  chat_model,
20
- search_type,
 
 
21
  top_k=6,
22
  ):
23
  """
@@ -25,7 +27,9 @@ def BuildGraph(
25
 
26
  Args:
27
  chat_model: LangChain chat model
28
- search_type: dense, sparse, or hybrid (for retriever)
 
 
29
  top_k: number of documents to retrieve
30
 
31
  Based on:
@@ -64,7 +68,7 @@ def BuildGraph(
64
  search_query: str,
65
  start_year: Optional[int] = None,
66
  end_year: Optional[int] = None,
67
- months: Optional[str] = None,
68
  ) -> str:
69
  """
70
  Retrieve emails related to a search query from the R-help mailing list archives.
@@ -75,23 +79,23 @@ def BuildGraph(
75
  search_query (str): Search query
76
  start_year (int, optional): Starting year for emails
77
  end_year (int, optional): Ending year for emails
78
- months (str, optional): One or more months separated by spaces
79
  """
80
  retriever = BuildRetriever(
 
 
81
  search_type,
82
  top_k,
83
  start_year,
84
  end_year,
 
85
  )
86
- # For now, just add the months to the search query
87
- if months:
88
- search_query = " ".join([search_query, months])
89
  # If the search query is empty, use the years
90
  if not search_query:
91
  search_query = " ".join([search_query, start_year, end_year])
92
  retrieved_docs = retriever.invoke(search_query)
93
  serialized = "\n\n--- --- --- --- Next Email --- --- --- ---".join(
94
- # Add file name (e.g. R-help/2024-December.txt) from source key
95
  "\n\n" + doc.metadata["source"] + doc.page_content
96
  for doc in retrieved_docs
97
  )
@@ -122,14 +126,14 @@ def BuildGraph(
122
 
123
  def query(state: MessagesState):
124
  """Queries the retriever with the chat model"""
125
- messages = [SystemMessage(query_prompt())] + state["messages"]
126
  response = query_model.invoke(messages)
127
 
128
  return {"messages": response}
129
 
130
  def answer(state: MessagesState):
131
  """Generates an answer with the chat model"""
132
- messages = [SystemMessage(answer_prompt())] + state["messages"]
133
  response = answer_model.invoke(messages)
134
 
135
  return {"messages": response}
 
17
 
18
  def BuildGraph(
19
  chat_model,
20
+ db_dir,
21
+ collection,
22
+ search_type="hybrid",
23
  top_k=6,
24
  ):
25
  """
 
27
 
28
  Args:
29
  chat_model: LangChain chat model
30
+ db_dir: Database directory
31
+ collection: Email collection
32
+ search_type: dense, sparse, or hybrid
33
  top_k: number of documents to retrieve
34
 
35
  Based on:
 
68
  search_query: str,
69
  start_year: Optional[int] = None,
70
  end_year: Optional[int] = None,
71
+ months: Optional[list[str]] = None,
72
  ) -> str:
73
  """
74
  Retrieve emails related to a search query from the R-help mailing list archives.
 
79
  search_query (str): Search query
80
  start_year (int, optional): Starting year for emails
81
  end_year (int, optional): Ending year for emails
82
+ months (list(str), optional): List of one or more months (three-letter abbreviations)
83
  """
84
  retriever = BuildRetriever(
85
+ db_dir,
86
+ collection,
87
  search_type,
88
  top_k,
89
  start_year,
90
  end_year,
91
+ months,
92
  )
 
 
 
93
  # If the search query is empty, use the years
94
  if not search_query:
95
  search_query = " ".join([search_query, start_year, end_year])
96
  retrieved_docs = retriever.invoke(search_query)
97
  serialized = "\n\n--- --- --- --- Next Email --- --- --- ---".join(
98
+ # Add source file name (e.g. R-help/2024-December.txt) from source key
99
  "\n\n" + doc.metadata["source"] + doc.page_content
100
  for doc in retrieved_docs
101
  )
 
126
 
127
  def query(state: MessagesState):
128
  """Queries the retriever with the chat model"""
129
+ messages = [SystemMessage(query_prompt(db_dir, collection))] + state["messages"]
130
  response = query_model.invoke(messages)
131
 
132
  return {"messages": response}
133
 
134
  def answer(state: MessagesState):
135
  """Generates an answer with the chat model"""
136
+ messages = [SystemMessage(answer_prompt(collection))] + state["messages"]
137
  response = answer_model.invoke(messages)
138
 
139
  return {"messages": response}
index.py CHANGED
@@ -3,18 +3,21 @@ from langchain_community.document_loaders import TextLoader
3
  from datetime import datetime
4
  import tempfile
5
  import os
 
6
 
7
  # Local modules
8
- from retriever import BuildRetriever, db_dir
9
  from mods.bm25s_retriever import BM25SRetriever
10
 
11
 
12
- def ProcessFile(file_path, search_type: str = "dense"):
13
  """
14
  Wrapper function to process file for dense or sparse search
15
 
16
  Args:
17
  file_path: File to process
 
 
18
  search_type: Type of search to use. Options: "dense", "sparse"
19
  """
20
 
@@ -65,10 +68,10 @@ def ProcessFile(file_path, search_type: str = "dense"):
65
  try:
66
  if search_type == "sparse":
67
  # Handle sparse search with BM25
68
- ProcessFileSparse(truncated_temp_file, file_path)
69
  elif search_type == "dense":
70
  # Handle dense search with ChromaDB
71
- ProcessFileDense(truncated_temp_file, file_path)
72
  else:
73
  raise ValueError(f"Unsupported search type: {search_type}")
74
  finally:
@@ -80,17 +83,25 @@ def ProcessFile(file_path, search_type: str = "dense"):
80
  pass
81
 
82
 
83
- def ProcessFileDense(cleaned_temp_file, file_path):
84
  """
85
  Process file for dense vector search using ChromaDB
86
  """
87
  # Get a retriever instance
88
- retriever = BuildRetriever("dense")
89
  # Load cleaned text file
90
  loader = TextLoader(cleaned_temp_file)
91
  documents = loader.load()
92
  # Use original file path for "source" key in metadata
93
  documents[0].metadata["source"] = file_path
 
 
 
 
 
 
 
 
94
  # Add file timestamp to metadata
95
  mod_time = os.path.getmtime(file_path)
96
  timestamp = datetime.fromtimestamp(mod_time).isoformat()
@@ -113,7 +124,7 @@ def ProcessFileDense(cleaned_temp_file, file_path):
113
  retriever.add_documents(documents_batch)
114
 
115
 
116
- def ProcessFileSparse(cleaned_temp_file, file_path):
117
  """
118
  Process file for sparse search using BM25
119
  """
@@ -126,18 +137,19 @@ def ProcessFileSparse(cleaned_temp_file, file_path):
126
  splitter = RecursiveCharacterTextSplitter(
127
  separators=["\n\n\nFrom"], chunk_size=1, chunk_overlap=0
128
  )
129
- ## Using 'EmailFrom' as the separator (requires preprocesing)
130
- # splitter = RecursiveCharacterTextSplitter(separators=["EmailFrom"])
131
  emails = splitter.split_documents(documents)
132
 
133
- # Use original file path for "source" key in metadata
134
  for email in emails:
 
135
  email.metadata["source"] = file_path
 
 
136
 
137
  # Create or update BM25 index
138
  try:
139
  # Update BM25 index if it exists
140
- bm25_persist_directory = f"{db_dir}/bm25"
141
  retriever = BM25SRetriever.from_persisted_directory(bm25_persist_directory)
142
  # Get new emails - ones which have not been indexed
143
  new_emails = [email for email in emails if email not in retriever.docs]
 
3
  from datetime import datetime
4
  import tempfile
5
  import os
6
+ import re
7
 
8
  # Local modules
9
+ from retriever import BuildRetriever
10
  from mods.bm25s_retriever import BM25SRetriever
11
 
12
 
13
+ def ProcessFile(file_path, db_dir, collection, search_type):
14
  """
15
  Wrapper function to process file for dense or sparse search
16
 
17
  Args:
18
  file_path: File to process
19
+ db_dir: Database directory
20
+ collection: Email collection
21
  search_type: Type of search to use. Options: "dense", "sparse"
22
  """
23
 
 
68
  try:
69
  if search_type == "sparse":
70
  # Handle sparse search with BM25
71
+ ProcessFileSparse(truncated_temp_file, file_path, db_dir, collection)
72
  elif search_type == "dense":
73
  # Handle dense search with ChromaDB
74
+ ProcessFileDense(truncated_temp_file, file_path, db_dir, collection)
75
  else:
76
  raise ValueError(f"Unsupported search type: {search_type}")
77
  finally:
 
83
  pass
84
 
85
 
86
+ def ProcessFileDense(cleaned_temp_file, file_path, db_dir, collection):
87
  """
88
  Process file for dense vector search using ChromaDB
89
  """
90
  # Get a retriever instance
91
+ retriever = BuildRetriever(db_dir, collection, "dense")
92
  # Load cleaned text file
93
  loader = TextLoader(cleaned_temp_file)
94
  documents = loader.load()
95
  # Use original file path for "source" key in metadata
96
  documents[0].metadata["source"] = file_path
97
+ # Add year and month to metadata
98
+ filename = os.path.basename(file_path)
99
+ pattern = re.compile(r"(\d{4})-([A-Za-z]+)\.txt")
100
+ match = pattern.match(filename)
101
+ year = int(match.group(1))
102
+ month = match.group(2)
103
+ documents[0].metadata["year"] = year
104
+ documents[0].metadata["month"] = month
105
  # Add file timestamp to metadata
106
  mod_time = os.path.getmtime(file_path)
107
  timestamp = datetime.fromtimestamp(mod_time).isoformat()
 
124
  retriever.add_documents(documents_batch)
125
 
126
 
127
+ def ProcessFileSparse(cleaned_temp_file, file_path, db_dir, collection):
128
  """
129
  Process file for sparse search using BM25
130
  """
 
137
  splitter = RecursiveCharacterTextSplitter(
138
  separators=["\n\n\nFrom"], chunk_size=1, chunk_overlap=0
139
  )
 
 
140
  emails = splitter.split_documents(documents)
141
 
142
+ # Add metadata keys
143
  for email in emails:
144
+ # Original file path, e.g. "R-help/2025-December.txt"
145
  email.metadata["source"] = file_path
146
+ # Collection name, e.g. "R-help"
147
+ email.metadata["collection"] = collection
148
 
149
  # Create or update BM25 index
150
  try:
151
  # Update BM25 index if it exists
152
+ bm25_persist_directory = os.path.join(db_dir, collection, "bm25")
153
  retriever = BM25SRetriever.from_persisted_directory(bm25_persist_directory)
154
  # Get new emails - ones which have not been indexed
155
  new_emails = [email for email in emails if email not in retriever.docs]
main.py CHANGED
@@ -13,7 +13,7 @@ import ast
13
  import os
14
 
15
  # Local modules
16
- from retriever import BuildRetriever, db_dir
17
  from prompts import answer_prompt
18
  from index import ProcessFile
19
  from graph import BuildGraph
@@ -38,29 +38,33 @@ httpx_logger = logging.getLogger("httpx")
38
  httpx_logger.setLevel(logging.WARNING)
39
 
40
 
41
- def ProcessDirectory(path):
42
  """
43
  Update vector store and sparse index for files in a directory, only adding new or updated files
44
 
45
  Args:
46
- path: Directory to process
 
47
 
48
  Usage example:
49
- ProcessDirectory("R-help")
50
  """
51
 
52
  # TODO: use UUID to process only changed documents
53
  # https://stackoverflow.com/questions/76265631/chromadb-add-single-document-only-if-it-doesnt-exist
54
 
 
 
 
55
  # Get a dense retriever instance
56
- retriever = BuildRetriever("dense")
57
 
58
  # List all text files in target directory
59
- file_paths = glob.glob(f"{path}/*.txt")
60
  for file_path in file_paths:
61
 
62
  # Process file for sparse search (BM25S)
63
- ProcessFile(file_path, "sparse")
64
 
65
  # Logic for dense search: skip file if already indexed
66
  # Look for existing embeddings for this file
@@ -90,7 +94,7 @@ def ProcessDirectory(path):
90
  update_file = True
91
 
92
  if add_file:
93
- ProcessFile(file_path, "dense")
94
 
95
  if update_file:
96
  print(f"Chroma: updated embeddings for {file_path}")
@@ -101,7 +105,7 @@ def ProcessDirectory(path):
101
  ]
102
  files_to_keep = list(set(used_doc_ids))
103
  # Get all files in the file store
104
- file_store = f"{db_dir}/file_store"
105
  all_files = os.listdir(file_store)
106
  # Iterate through the files and delete those not in the list
107
  for file in all_files:
@@ -115,7 +119,9 @@ def ProcessDirectory(path):
115
 
116
 
117
  def RunChain(
118
- query,
 
 
119
  search_type: str = "hybrid",
120
  ):
121
  """
@@ -123,14 +129,16 @@ def RunChain(
123
 
124
  Args:
125
  query: User's query
 
 
126
  search_type: Type of search to use. Options: "dense", "sparse", or "hybrid"
127
 
128
  Example:
129
- RunChain("What R functions are discussed?")
130
  """
131
 
132
  # Get retriever instance
133
- retriever = BuildRetriever(search_type)
134
 
135
  if retriever is None:
136
  return "No retriever available. Please process some documents first."
@@ -139,7 +147,7 @@ def RunChain(
139
  chat_model = ChatOpenAI(model=openai_model, temperature=0)
140
 
141
  # Get system prompt
142
- system_prompt = answer_prompt()
143
 
144
  # Create a prompt template
145
  system_template = ChatPromptTemplate.from_messages([SystemMessage(system_prompt)])
@@ -170,6 +178,8 @@ def RunChain(
170
 
171
  def RunGraph(
172
  query: str,
 
 
173
  search_type: str = "hybrid",
174
  top_k: int = 6,
175
  thread_id=None,
@@ -178,6 +188,8 @@ def RunGraph(
178
 
179
  Args:
180
  query: User query to start the chat
 
 
181
  search_type: Type of search to use. Options: "dense", "sparse", or "hybrid"
182
  top_k: Number of documents to retrieve
183
  thread_id: Thread ID for memory (optional)
@@ -191,6 +203,8 @@ def RunGraph(
191
  # Build the graph
192
  graph_builder = BuildGraph(
193
  chat_model,
 
 
194
  search_type,
195
  top_k,
196
  )
 
13
  import os
14
 
15
  # Local modules
16
+ from retriever import BuildRetriever
17
  from prompts import answer_prompt
18
  from index import ProcessFile
19
  from graph import BuildGraph
 
38
  httpx_logger.setLevel(logging.WARNING)
39
 
40
 
41
+ def ProcessCollection(email_dir, db_dir):
42
  """
43
  Update vector store and sparse index for files in a directory, only adding new or updated files
44
 
45
  Args:
46
+ email_dir: Email directory to process
47
+ db_dir: Database directory
48
 
49
  Usage example:
50
+ ProcessCollection("R-help", "db")
51
  """
52
 
53
  # TODO: use UUID to process only changed documents
54
  # https://stackoverflow.com/questions/76265631/chromadb-add-single-document-only-if-it-doesnt-exist
55
 
56
+ # Get last part of path
57
+ # https://stackoverflow.com/questions/3925096/how-to-get-only-the-last-part-of-a-path-in-python
58
+ collection = os.path.basename(os.path.normpath(email_dir))
59
  # Get a dense retriever instance
60
+ retriever = BuildRetriever(db_dir, collection, "dense")
61
 
62
  # List all text files in target directory
63
+ file_paths = glob.glob(f"{email_dir}/*.txt")
64
  for file_path in file_paths:
65
 
66
  # Process file for sparse search (BM25S)
67
+ ProcessFile(file_path, db_dir, collection, "sparse")
68
 
69
  # Logic for dense search: skip file if already indexed
70
  # Look for existing embeddings for this file
 
94
  update_file = True
95
 
96
  if add_file:
97
+ ProcessFile(file_path, db_dir, collection, "dense")
98
 
99
  if update_file:
100
  print(f"Chroma: updated embeddings for {file_path}")
 
105
  ]
106
  files_to_keep = list(set(used_doc_ids))
107
  # Get all files in the file store
108
+ file_store = os.path.join(db_dir, collection, "file_store")
109
  all_files = os.listdir(file_store)
110
  # Iterate through the files and delete those not in the list
111
  for file in all_files:
 
119
 
120
 
121
  def RunChain(
122
+ query: str,
123
+ db_dir: str,
124
+ collection: str,
125
  search_type: str = "hybrid",
126
  ):
127
  """
 
129
 
130
  Args:
131
  query: User's query
132
+ db_dir: Database directory
133
+ collection: Email collection
134
  search_type: Type of search to use. Options: "dense", "sparse", or "hybrid"
135
 
136
  Example:
137
+ RunChain("What R functions are discussed?", "db", "R-help")
138
  """
139
 
140
  # Get retriever instance
141
+ retriever = BuildRetriever(db_dir, collection, search_type)
142
 
143
  if retriever is None:
144
  return "No retriever available. Please process some documents first."
 
147
  chat_model = ChatOpenAI(model=openai_model, temperature=0)
148
 
149
  # Get system prompt
150
+ system_prompt = answer_prompt(collection)
151
 
152
  # Create a prompt template
153
  system_template = ChatPromptTemplate.from_messages([SystemMessage(system_prompt)])
 
178
 
179
  def RunGraph(
180
  query: str,
181
+ db_dir: str,
182
+ collection: str,
183
  search_type: str = "hybrid",
184
  top_k: int = 6,
185
  thread_id=None,
 
188
 
189
  Args:
190
  query: User query to start the chat
191
+ db_dir: Database directory
192
+ collection: Email collection
193
  search_type: Type of search to use. Options: "dense", "sparse", or "hybrid"
194
  top_k: Number of documents to retrieve
195
  thread_id: Thread ID for memory (optional)
 
203
  # Build the graph
204
  graph_builder = BuildGraph(
205
  chat_model,
206
+ db_dir,
207
+ collection,
208
  search_type,
209
  top_k,
210
  )
prompts.py CHANGED
@@ -12,15 +12,28 @@ def check_prompt(prompt):
12
  return prompt
13
 
14
 
15
- def query_prompt():
16
- """Return system prompt for query step"""
 
 
 
 
 
 
17
 
18
  # Get start and end months from database
19
- start, end = get_start_end_months(get_sources())
 
 
 
 
 
 
 
20
 
21
  prompt = (
22
  f"Today Date: {date.today()}. "
23
- "You are a helpful assistant designed to get information about R programming from the R-help mailing list archives. "
24
  "Write a search query to retrieve emails relevant to the user's question. "
25
  "Do not answer the user's question and do not ask the user for more information. "
26
  # gpt-4o-mini thinks last two months aren't available with this: "Emails from from {start} to {end} are available for retrieval. "
@@ -29,31 +42,36 @@ def query_prompt():
29
  "Always use retrieve_emails with a non-empty query string for search_query. "
30
  "For general summaries, use retrieve_emails(search_query='R'). "
31
  "For questions about years, use retrieve_emails(search_query=<query>, start_year=, end_year=). "
32
- "For questions about months, use 3-letter abbreviations (Jan...Dec) for the 'month' argument. "
33
  "Use all previous messages as context to formulate your search query. " # Gemma
34
  "You should always retrieve more emails based on context and the most recent question. " # Qwen
35
- # "Even if retrieved emails are available, you should retrieve more emails to answer the most recent question. " # Qwen
36
- # "You must perform the search yourself. Do not tell the user how to retrieve emails. " # Qwen
37
- # "Do not use your memory or knowledge to answer the user's question. Only retrieve emails based on the user's question. " # Qwen
38
- # "If you decide not to retrieve emails, tell the user why and suggest how to improve their question to chat with the R-help mailing list. "
39
  )
40
  prompt = check_prompt(prompt)
41
 
42
  return prompt
43
 
44
 
45
- def answer_prompt():
46
  """Return system prompt for answer step"""
 
 
 
 
 
 
 
 
 
47
  prompt = (
48
  f"Today Date: {date.today()}. "
49
- "You are a helpful chatbot designed to answer questions about R programming based on the R-help mailing list archives. "
50
  "Summarize the retrieved emails to answer the user's question or query. "
51
  "If any of the retrieved emails are irrelevant (e.g. wrong dates), then do not use them. "
52
  "Tell the user if there are no retrieved emails or if you are unable to answer the question based on the information in the emails. "
53
  "Do not give an answer based on your own knowledge or memory, and do not include examples that aren't based on the retrieved emails. "
54
  "Example: For a question about using lm(), take examples of lm() from the retrieved emails to answer the user's question. "
55
  # "Do not respond with packages that are only listed under sessionInfo, session info, or other attached packages. "
56
- "Summarize the content of the emails rather than copying the headers. " # Qwen
57
  "You must include inline citations (email senders and dates) in each part of your response. "
58
  "Only answer general questions about R if the answer is in the retrieved emails. "
59
  "Only include URLs if they were used by human authors (not in email headers), and do not modify any URLs. " # Qwen, Gemma
 
12
  return prompt
13
 
14
 
15
+ def query_prompt(db_dir, collection):
16
+ """
17
+ Return system prompt for query step
18
+
19
+ Args:
20
+ db_dir: Database directory
21
+ collection: Email collection
22
+ """
23
 
24
  # Get start and end months from database
25
+ start, end = get_start_end_months(get_sources(db_dir, collection))
26
+ # Use appropriate list topic
27
+ if collection == "R-help":
28
+ topic = "R programming"
29
+ elif collection == "R-devel":
30
+ topic = "R development"
31
+ elif collection == "R-package-devel":
32
+ topic = "R package development"
33
 
34
  prompt = (
35
  f"Today Date: {date.today()}. "
36
+ f"You are a search assistant for retrieving information about {topic} from the {collection} mailing list archives. "
37
  "Write a search query to retrieve emails relevant to the user's question. "
38
  "Do not answer the user's question and do not ask the user for more information. "
39
  # gpt-4o-mini thinks last two months aren't available with this: "Emails from from {start} to {end} are available for retrieval. "
 
42
  "Always use retrieve_emails with a non-empty query string for search_query. "
43
  "For general summaries, use retrieve_emails(search_query='R'). "
44
  "For questions about years, use retrieve_emails(search_query=<query>, start_year=, end_year=). "
45
+ "For questions about months, use 3-letter abbreviations (Jan...Dec) for the 'months' argument. "
46
  "Use all previous messages as context to formulate your search query. " # Gemma
47
  "You should always retrieve more emails based on context and the most recent question. " # Qwen
48
+ f"If you decide not to retrieve emails, tell the user how to improve their question to search the {collection} mailing list. "
 
 
 
49
  )
50
  prompt = check_prompt(prompt)
51
 
52
  return prompt
53
 
54
 
55
+ def answer_prompt(collection):
56
  """Return system prompt for answer step"""
57
+
58
+ # Use appropriate list topic
59
+ if collection == "R-help":
60
+ topic = "R programming"
61
+ elif collection == "R-devel":
62
+ topic = "R development"
63
+ elif collection == "R-package-devel":
64
+ topic = "R package development"
65
+
66
  prompt = (
67
  f"Today Date: {date.today()}. "
68
+ f"You are a helpful chatbot that can answer questions about {topic} based on the {collection} mailing list archives. "
69
  "Summarize the retrieved emails to answer the user's question or query. "
70
  "If any of the retrieved emails are irrelevant (e.g. wrong dates), then do not use them. "
71
  "Tell the user if there are no retrieved emails or if you are unable to answer the question based on the information in the emails. "
72
  "Do not give an answer based on your own knowledge or memory, and do not include examples that aren't based on the retrieved emails. "
73
  "Example: For a question about using lm(), take examples of lm() from the retrieved emails to answer the user's question. "
74
  # "Do not respond with packages that are only listed under sessionInfo, session info, or other attached packages. "
 
75
  "You must include inline citations (email senders and dates) in each part of your response. "
76
  "Only answer general questions about R if the answer is in the retrieved emails. "
77
  "Only include URLs if they were used by human authors (not in email headers), and do not modify any URLs. " # Qwen, Gemma
retriever.py CHANGED
@@ -11,54 +11,69 @@ from typing import Any, Optional
11
  import chromadb
12
  import os
13
  import re
 
14
 
15
  # Local modules
16
  from mods.bm25s_retriever import BM25SRetriever
17
  from mods.file_system import LocalFileStore
18
-
19
- # Database directory
20
- db_dir = "db"
21
 
22
 
23
  def BuildRetriever(
24
- search_type: str = "hybrid",
25
- top_k=6,
26
- start_year=None,
27
- end_year=None,
 
 
 
28
  ):
29
  """
30
  Build retriever instance.
31
  All retriever types are configured to return up to 6 documents for fair comparison in evals.
32
 
33
  Args:
 
 
34
  search_type: Type of search to use. Options: "dense", "sparse", "hybrid"
35
  top_k: Number of documents to retrieve for "dense" and "sparse"
36
  start_year: Start year (optional)
37
  end_year: End year (optional)
 
38
  """
39
  if search_type == "dense":
40
- if not (start_year or end_year):
41
- # No year filtering, so directly use base retriever
42
- return BuildRetrieverDense(top_k=top_k)
 
 
43
  else:
44
- # Get 1000 documents then keep top_k filtered by year
45
- base_retriever = BuildRetrieverDense(top_k=1000)
 
 
46
  return TopKRetriever(
47
  base_retriever=base_retriever,
48
  top_k=top_k,
49
  start_year=start_year,
50
  end_year=end_year,
 
51
  )
52
  if search_type == "sparse":
53
- if not (start_year or end_year):
54
- return BuildRetrieverSparse(top_k=top_k)
 
 
55
  else:
56
- base_retriever = BuildRetrieverSparse(top_k=1000)
 
 
57
  return TopKRetriever(
58
  base_retriever=base_retriever,
59
  top_k=top_k,
60
  start_year=start_year,
61
  end_year=end_year,
 
62
  )
63
  elif search_type == "hybrid":
64
  # Hybrid search (dense + sparse) - use ensemble method
@@ -66,16 +81,22 @@ def BuildRetriever(
66
  # Use floor (top_k // 2) and ceiling -(top_k // -2) to divide odd values of top_k
67
  # https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python
68
  dense_retriever = BuildRetriever(
 
 
69
  "dense",
70
  (top_k // 2),
71
  start_year,
72
  end_year,
 
73
  )
74
  sparse_retriever = BuildRetriever(
 
 
75
  "sparse",
76
  -(top_k // -2),
77
  start_year,
78
  end_year,
 
79
  )
80
  ensemble_retriever = EnsembleRetriever(
81
  retrievers=[dense_retriever, sparse_retriever], weights=[1, 1]
@@ -85,31 +106,38 @@ def BuildRetriever(
85
  raise ValueError(f"Unsupported search type: {search_type}")
86
 
87
 
88
- def BuildRetrieverSparse(top_k=6):
89
  """
90
  Build sparse retriever instance
91
 
92
  Args:
 
 
93
  top_k: Number of documents to retrieve
94
  """
95
  # BM25 persistent directory
96
- bm25_persist_directory = f"{db_dir}/bm25"
97
  if not os.path.exists(bm25_persist_directory):
98
  os.makedirs(bm25_persist_directory)
99
 
100
  # Use BM25 sparse search
 
 
 
101
  retriever = BM25SRetriever.from_persisted_directory(
102
  path=bm25_persist_directory,
103
- k=top_k,
104
  )
105
  return retriever
106
 
107
 
108
- def BuildRetrieverDense(top_k=6):
109
  """
110
  Build dense retriever instance with ChromaDB vectorstore
111
 
112
  Args:
 
 
113
  top_k: Number of documents to retrieve
114
  """
115
 
@@ -117,15 +145,15 @@ def BuildRetrieverDense(top_k=6):
117
  embedding_function = OpenAIEmbeddings(model="text-embedding-3-small")
118
  # Create vector store
119
  client_settings = chromadb.config.Settings(anonymized_telemetry=False)
120
- persist_directory = f"{db_dir}/chroma"
121
  vectorstore = Chroma(
122
- collection_name="R-help",
123
  embedding_function=embedding_function,
124
  client_settings=client_settings,
125
  persist_directory=persist_directory,
126
  )
127
  # The storage layer for the parent documents
128
- file_store = f"{db_dir}/file_store"
129
  byte_store = LocalFileStore(file_store)
130
  # Text splitter for child documents
131
  child_splitter = RecursiveCharacterTextSplitter(
@@ -152,18 +180,21 @@ def BuildRetrieverDense(top_k=6):
152
 
153
 
154
  class TopKRetriever(BaseRetriever):
155
- """Retriever that wraps a base retriever and returns the top k documents, optionally matching given start and/or end years."""
 
 
156
 
157
- # Code adapted from langchain/retrievers/contextual_compression.py
 
158
 
 
159
  base_retriever: RetrieverLike
160
- """Base Retriever to use for getting relevant documents."""
161
-
162
  top_k: int = 6
163
- """Number of documents to return."""
164
-
165
  start_year: Optional[int] = None
166
  end_year: Optional[int] = None
 
167
 
168
  def _get_relevant_documents(
169
  self,
@@ -172,7 +203,8 @@ class TopKRetriever(BaseRetriever):
172
  run_manager: CallbackManagerForRetrieverRun,
173
  **kwargs: Any,
174
  ) -> list[Document]:
175
- """Return the top k documents within start and end years if given.
 
176
 
177
  Returns:
178
  Sequence of documents
@@ -183,28 +215,84 @@ class TopKRetriever(BaseRetriever):
183
  )
184
  if retrieved_docs:
185
 
186
- # Get the sources (file names) and years
187
  sources = [doc.metadata["source"] for doc in filtered_docs]
188
- years = [
189
- re.sub(r"-[A-Za-z]+\.txt", "", source.replace("R-help/", ""))
190
- for source in sources
191
- ]
192
- # Convert years to integer
193
- years = [int(year) for year in years]
 
 
 
 
 
 
 
 
194
 
195
- # Filtering by year
196
- if self.start_year:
197
- in_range = after_start = [year >= self.start_year for year in years]
198
- if self.end_year:
199
- in_range = before_end = [year <= self.end_year for year in years]
200
- if self.start_year and self.end_year:
201
- in_range = [
202
- after and before for after, before in zip(after_start, before_end)
 
 
203
  ]
 
 
 
 
 
 
 
 
204
  if self.start_year or self.end_year:
205
- # Extract docs where the year is in the start-end range
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  filtered_docs = [
207
- doc for doc, in_range in zip(retrieved_docs, in_range) if in_range
208
  ]
209
 
210
  # Return the top k docs
 
11
  import chromadb
12
  import os
13
  import re
14
+ from calendar import month_abbr, month_name
15
 
16
  # Local modules
17
  from mods.bm25s_retriever import BM25SRetriever
18
  from mods.file_system import LocalFileStore
19
+ from util import get_sources
 
 
20
 
21
 
22
  def BuildRetriever(
23
+ db_dir: str,
24
+ collection: str,
25
+ search_type: str,
26
+ top_k: Optional[int] = 6,
27
+ start_year: Optional[int] = None,
28
+ end_year: Optional[int] = None,
29
+ months: Optional[list[str]] = None,
30
  ):
31
  """
32
  Build retriever instance.
33
  All retriever types are configured to return up to 6 documents for fair comparison in evals.
34
 
35
  Args:
36
+ db_dir: Database directory
37
+ collection: Email collection
38
  search_type: Type of search to use. Options: "dense", "sparse", "hybrid"
39
  top_k: Number of documents to retrieve for "dense" and "sparse"
40
  start_year: Start year (optional)
41
  end_year: End year (optional)
42
+ months: List of months (3-letter abbreviations) (optional)
43
  """
44
  if search_type == "dense":
45
+ if not (start_year or end_year or months):
46
+ # No year or month filtering, so directly use base retriever
47
+ return BuildRetrieverDense(
48
+ db_dir=db_dir, collection=collection, top_k=top_k
49
+ )
50
  else:
51
+ # Get 10000 documents then keep top_k filtered by year and month
52
+ base_retriever = BuildRetrieverDense(
53
+ db_dir=db_dir, collection=collection, top_k=10000
54
+ )
55
  return TopKRetriever(
56
  base_retriever=base_retriever,
57
  top_k=top_k,
58
  start_year=start_year,
59
  end_year=end_year,
60
+ months=months,
61
  )
62
  if search_type == "sparse":
63
+ if not (start_year or end_year or months):
64
+ return BuildRetrieverSparse(
65
+ db_dir=db_dir, collection=collection, top_k=top_k
66
+ )
67
  else:
68
+ base_retriever = BuildRetrieverSparse(
69
+ db_dir=db_dir, collection=collection, top_k=10000
70
+ )
71
  return TopKRetriever(
72
  base_retriever=base_retriever,
73
  top_k=top_k,
74
  start_year=start_year,
75
  end_year=end_year,
76
+ months=months,
77
  )
78
  elif search_type == "hybrid":
79
  # Hybrid search (dense + sparse) - use ensemble method
 
81
  # Use floor (top_k // 2) and ceiling -(top_k // -2) to divide odd values of top_k
82
  # https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python
83
  dense_retriever = BuildRetriever(
84
+ db_dir,
85
+ collection,
86
  "dense",
87
  (top_k // 2),
88
  start_year,
89
  end_year,
90
+ months,
91
  )
92
  sparse_retriever = BuildRetriever(
93
+ db_dir,
94
+ collection,
95
  "sparse",
96
  -(top_k // -2),
97
  start_year,
98
  end_year,
99
+ months,
100
  )
101
  ensemble_retriever = EnsembleRetriever(
102
  retrievers=[dense_retriever, sparse_retriever], weights=[1, 1]
 
106
  raise ValueError(f"Unsupported search type: {search_type}")
107
 
108
 
109
def BuildRetrieverSparse(db_dir, collection, top_k=6):
    """
    Build sparse (BM25) retriever instance.

    Args:
        db_dir: Database directory
        collection: Email collection (e.g. "R-help")
        top_k: Number of documents to retrieve

    Returns:
        A BM25SRetriever loaded from the collection's persisted index.
    """
    # BM25 persistent directory for this collection
    bm25_persist_directory = os.path.join(db_dir, collection, "bm25")
    # exist_ok avoids the check-then-create race of exists()+makedirs()
    os.makedirs(bm25_persist_directory, exist_ok=True)

    # Use BM25 sparse search
    # top_k can't be larger than the corpus size (number of emails)
    corpus_size = len(get_sources(db_dir, collection))
    k = min(top_k, corpus_size)
    retriever = BM25SRetriever.from_persisted_directory(
        path=bm25_persist_directory,
        k=k,
    )
    return retriever
132
 
133
 
134
+ def BuildRetrieverDense(db_dir, collection, top_k=6):
135
  """
136
  Build dense retriever instance with ChromaDB vectorstore
137
 
138
  Args:
139
+ db_dir: Database directory
140
+ collection: Email collection
141
  top_k: Number of documents to retrieve
142
  """
143
 
 
145
  embedding_function = OpenAIEmbeddings(model="text-embedding-3-small")
146
  # Create vector store
147
  client_settings = chromadb.config.Settings(anonymized_telemetry=False)
148
+ persist_directory = os.path.join(db_dir, collection, "chroma")
149
  vectorstore = Chroma(
150
+ collection_name=collection,
151
  embedding_function=embedding_function,
152
  client_settings=client_settings,
153
  persist_directory=persist_directory,
154
  )
155
  # The storage layer for the parent documents
156
+ file_store = os.path.join(db_dir, collection, "file_store")
157
  byte_store = LocalFileStore(file_store)
158
  # Text splitter for child documents
159
  child_splitter = RecursiveCharacterTextSplitter(
 
180
 
181
 
182
  class TopKRetriever(BaseRetriever):
183
+ """
184
+ Retriever that wraps a base retriever and returns the top k documents,
185
+ optionally matching given start and/or end years.
186
 
187
+ Code adapted from langchain/retrievers/contextual_compression.py
188
+ """
189
 
190
+ # Base Retriever to use for getting relevant documents
191
  base_retriever: RetrieverLike
192
+ # Number of documents to return
 
193
  top_k: int = 6
194
+ # Optional year and month arguments
 
195
  start_year: Optional[int] = None
196
  end_year: Optional[int] = None
197
+ months: Optional[list[str]] = None
198
 
199
  def _get_relevant_documents(
200
  self,
 
203
  run_manager: CallbackManagerForRetrieverRun,
204
  **kwargs: Any,
205
  ) -> list[Document]:
206
+ """
207
+ Return the top k documents within start and end years (and months) if given.
208
 
209
  Returns:
210
  Sequence of documents
 
215
  )
216
  if retrieved_docs:
217
 
218
+ # Get the email source files and basenames
219
  sources = [doc.metadata["source"] for doc in filtered_docs]
220
+ filenames = [os.path.basename(source) for source in sources]
221
+ # Get the years and months
222
+ pattern = re.compile(r"(\d{4})-([A-Za-z]+)\.txt")
223
+ matches = [pattern.match(filename) for filename in filenames]
224
+ # Extract years and month names, handling None matches
225
+ years = []
226
+ month_names = []
227
+ for match in matches:
228
+ if match:
229
+ years.append(int(match.group(1)))
230
+ month_names.append(match.group(2))
231
+ else:
232
+ years.append(None)
233
+ month_names.append(None)
234
 
235
+ # Create mapping from 3-letter abbreviations to full month names
236
+ # month_abbr[0] is empty string, month_abbr[1] is "Jan", etc.
237
+ # month_name[0] is empty string, month_name[1] is "January", etc.
238
+ abbr_to_full = {month_abbr[i].lower(): month_name[i] for i in range(1, 13)}
239
+
240
+ # Convert months list (3-letter abbreviations) to full month names
241
+ target_months = None
242
+ if self.months:
243
+ target_months = [
244
+ abbr_to_full.get(month.lower()) for month in self.months
245
  ]
246
+ # Filter out None values in case of invalid abbreviations
247
+ target_months = [m for m in target_months if m is not None]
248
+
249
+ # Initialize filter flags
250
+ year_filter = None
251
+ month_filter = None
252
+
253
+ # Filtering by year
254
  if self.start_year or self.end_year:
255
+ if self.start_year and self.end_year:
256
+ year_filter = [
257
+ year is not None
258
+ and year >= self.start_year
259
+ and year <= self.end_year
260
+ for year in years
261
+ ]
262
+ elif self.start_year:
263
+ year_filter = [
264
+ year is not None and year >= self.start_year for year in years
265
+ ]
266
+ elif self.end_year:
267
+ year_filter = [
268
+ year is not None and year <= self.end_year for year in years
269
+ ]
270
+
271
+ # Filtering by month
272
+ if target_months:
273
+ month_filter = [
274
+ month_name is not None and month_name in target_months
275
+ for month_name in month_names
276
+ ]
277
+
278
+ # Combine filters
279
+ if year_filter is not None and month_filter is not None:
280
+ # Both year and month filters
281
+ combined_filter = [
282
+ year and month for year, month in zip(year_filter, month_filter)
283
+ ]
284
+ filtered_docs = [
285
+ doc for doc, keep in zip(retrieved_docs, combined_filter) if keep
286
+ ]
287
+ elif year_filter is not None:
288
+ # Only year filter
289
+ filtered_docs = [
290
+ doc for doc, keep in zip(retrieved_docs, year_filter) if keep
291
+ ]
292
+ elif month_filter is not None:
293
+ # Only month filter
294
  filtered_docs = [
295
+ doc for doc, keep in zip(retrieved_docs, month_filter) if keep
296
  ]
297
 
298
  # Return the top k docs
util.py CHANGED
@@ -1,18 +1,23 @@
1
  from calendar import month_name
2
- from retriever import BuildRetriever, db_dir
3
  import json
4
  import os
5
  import re
6
 
7
 
8
- def get_sources():
9
  """
10
- Return the source files indexed in the database, e.g. 'R-help/2024-April.txt'.
 
 
 
 
 
 
11
  """
12
- # Path to your JSON Lines file
13
- file_path = os.path.join(db_dir, "bm25", "corpus.jsonl")
14
 
15
- # Reading the JSON Lines file
16
  with open(file_path, "r", encoding="utf-8") as file:
17
  # Parse each line as a JSON object
18
  sources = [json.loads(line.strip())["metadata"]["source"] for line in file]
@@ -24,11 +29,13 @@ def get_start_end_months(sources):
24
  """
25
  Given a set of filenames like 'R-help/2024-January.txt', return the earliest and latest month in 'Month YYYY' format.
26
  """
27
- pattern = re.compile(r"R-help/(\d{4})-([A-Za-z]+)\.txt")
 
 
28
  months = []
29
- # Start with the unique sources
30
- unique_sources = set(sources)
31
- for src in unique_sources:
32
  m = pattern.match(src)
33
  if m:
34
  year = int(m.group(1))
 
1
  from calendar import month_name
 
2
  import json
3
  import os
4
  import re
5
 
6
 
7
+ def get_sources(db_dir, collection):
8
  """
9
+ Return the source files for all emails indexed in the database.
10
+ The source file names look like 'R-help/2024-April.txt' and are repeated
11
+ for as many times as there are indexed emails from each source file.
12
+
13
+ Args:
14
+ db_dir: Database directory
15
+ collection: Email collection
16
  """
17
+ # Path to the JSON Lines file
18
+ file_path = os.path.join(db_dir, collection, "bm25", "corpus.jsonl")
19
 
20
+ # Read the JSON Lines file
21
  with open(file_path, "r", encoding="utf-8") as file:
22
  # Parse each line as a JSON object
23
  sources = [json.loads(line.strip())["metadata"]["source"] for line in file]
 
29
  """
30
  Given a set of filenames like 'R-help/2024-January.txt', return the earliest and latest month in 'Month YYYY' format.
31
  """
32
+ # Get just the file names (e.g. 2024-January.txt)
33
+ filenames = [os.path.basename(source) for source in sources]
34
+ pattern = re.compile(r"(\d{4})-([A-Za-z]+)\.txt")
35
  months = []
36
+ # Start with the unique filenames
37
+ unique_filenames = set(filenames)
38
+ for src in unique_filenames:
39
  m = pattern.match(src)
40
  if m:
41
  year = int(m.group(1))