Spaces:

jedick
/

R-help-chat

Running

App Files Files Community

jedick committed on Jan 2

Commit

429393a

1 Parent(s): 8627eb1

Remove local compute mode

Browse files

Files changed (12) hide show

app.py +67 -179
eval.py +5 -14
graph.py +12 -73
images/graph_LR.png +0 -0
index.py +4 -5
main.py +13 -96
mods/tool_calling_llm.py +0 -313
pipeline.py +0 -86
prompts.py +7 -66
requirements.txt +13 -38
retriever.py +11 -58
util.py +0 -15

app.py CHANGED Viewed

@@ -1,42 +1,25 @@
 from langgraph.checkpoint.memory import MemorySaver
-from huggingface_hub import snapshot_download
 from dotenv import load_dotenv
 from datetime import datetime
 import gradio as gr
-import spaces
-import torch
 import uuid
 import ast
 import os
 import re
 # Local modules
-from main import GetChatModel, openai_model, model_id
 from util import get_sources, get_start_end_months
-from retriever import db_dir, embedding_model_id
-from mods.tool_calling_llm import extract_think
 from data import download_data, extract_data
 from graph import BuildGraph
 # Set environment variables
 load_dotenv(dotenv_path=".env", override=True)
 # Hide BM25S progress bars
 os.environ["DISABLE_TQDM"] = "true"
-# Download model snapshots from Hugging Face Hub
-if torch.cuda.is_available():
-    print(f"Downloading checkpoints for {model_id}...")
-    ckpt_dir = snapshot_download(model_id, local_dir_use_symlinks=False)
-    print(f"Using checkpoints from {ckpt_dir}")
-    print(f"Downloading checkpoints for {embedding_model_id}...")
-    embedding_ckpt_dir = snapshot_download(
-        embedding_model_id, local_dir_use_symlinks=False
-    )
-    print(f"Using embedding checkpoints from {embedding_ckpt_dir}")
-else:
-    ckpt_dir = None
-    embedding_ckpt_dir = None
 # Download and extract data if data directory is not present
 if not os.path.isdir(db_dir):
     print("Downloading data ... ", end="")
@@ -51,17 +34,35 @@ search_type = "hybrid"
 # Global variables for LangChain graph: use dictionaries to store user-specific instances
 # https://www.gradio.app/guides/state-in-blocks
-graph_instances = {"local": {}, "remote": {}}
 def cleanup_graph(request: gr.Request):
     timestamp = datetime.now().replace(microsecond=0).isoformat()
-    if request.session_hash in graph_instances["local"]:
-        del graph_instances["local"][request.session_hash]
-        print(f"{timestamp} - Delete local graph for session {request.session_hash}")
-    if request.session_hash in graph_instances["remote"]:
-        del graph_instances["remote"][request.session_hash]
-        print(f"{timestamp} - Delete remote graph for session {request.session_hash}")
 def append_content(chunk_messages, history, thinking_about):
@@ -85,48 +86,32 @@ def append_content(chunk_messages, history, thinking_about):
     return history
-def run_workflow(input, history, compute_mode, thread_id, session_hash):
     """The main function to run the chat workflow"""
-    # Error if user tries to run local mode without GPU
-    if compute_mode == "local":
-        if not torch.cuda.is_available():
-            raise gr.Error(
-                "Local mode requires GPU.",
-                print_exception=False,
-            )
     # Get graph instance
-    graph = graph_instances[compute_mode].get(session_hash)
     if graph is None:
-        # Notify when we're loading the local model because it takes some time
-        if compute_mode == "local":
-            gr.Info(
-                f"Please wait for the local model to load",
-                title=f"Model loading...",
-            )
         # Get the chat model and build the graph
-        chat_model = GetChatModel(compute_mode, ckpt_dir)
         graph_builder = BuildGraph(
             chat_model,
-            compute_mode,
             search_type,
-            embedding_ckpt_dir=embedding_ckpt_dir,
         )
         # Compile the graph with an in-memory checkpointer
         memory = MemorySaver()
         graph = graph_builder.compile(checkpointer=memory)
-        # Set global graph for compute mode
-        graph_instances[compute_mode][session_hash] = graph
         # ISO 8601 timestamp with local timezone information without microsecond
         timestamp = datetime.now().replace(microsecond=0).isoformat()
-        print(f"{timestamp} - Set {compute_mode} graph for session {session_hash}")
-        # Notify when model finishes loading
-        gr.Success(f"{compute_mode}", duration=4, title=f"Model loaded!")
     else:
         timestamp = datetime.now().replace(microsecond=0).isoformat()
-        print(f"{timestamp} - Get {compute_mode} graph for session {session_hash}")
     # print(f"Using thread_id: {thread_id}")
@@ -235,28 +220,11 @@ def run_workflow(input, history, compute_mode, thread_id, session_hash):
 def to_workflow(request: gr.Request, *args):
-    """Wrapper function to call function with or without @spaces.GPU"""
     input = args[0]
-    compute_mode = args[2]
     # Add session_hash to arguments
     new_args = args + (request.session_hash,)
-    if compute_mode == "local":
-        # Call the workflow function with the @spaces.GPU decorator
-        for value in run_workflow_local(*new_args):
-            yield value
-    if compute_mode == "remote":
-        for value in run_workflow_remote(*new_args):
-            yield value
-@spaces.GPU(duration=100)
-def run_workflow_local(*args):
-    for value in run_workflow(*args):
-        yield value
-def run_workflow_remote(*args):
-    for value in run_workflow(*args):
         yield value
@@ -290,19 +258,6 @@ with gr.Blocks(
     # Define components
     # -----------------
-    compute_mode = gr.Radio(
-        choices=[
-            "local",
-            "remote",
-        ],
-        # Default to remote because it provides a better first impression for most people
-        # value=("local" if torch.cuda.is_available() else "remote"),
-        value="remote",
-        label="Compute Mode",
-        info="NOTE: remote mode **does not** use ZeroGPU",
-        render=False,
-    )
     loading_data = gr.Textbox(
         "Please wait for the email database to be downloaded and extracted.",
         max_lines=0,
@@ -332,14 +287,7 @@ with gr.Blocks(
     chatbot = gr.Chatbot(
         type="messages",
         show_label=False,
-        avatar_images=(
-            None,
-            (
-                "images/cloud.png"
-                if compute_mode.value == "remote"
-                else "images/chip.png"
-            ),
-        ),
         show_copy_all_button=True,
         render=False,
     )
@@ -398,24 +346,17 @@ with gr.Blocks(
             and generates an answer from the retrieved emails (*emails are shown below the chatbot*).
             You can ask follow-up questions with the chat history as context.
             Press the clear button (🗑) to clear the history and start a new chat.
             """
         return intro
-    def get_status_text(compute_mode):
-        if compute_mode == "remote":
-            status_text = f"""
-            🌐 Now in **remote** mode, using the OpenAI API<br>
-            ⚠️ **_Privacy Notice_**: Data sharing with OpenAI is enabled<br>
-            ✨ text-embedding-3-small and {openai_model}<br>
-            🏠 See the project's [GitHub repository](https://github.com/jedick/R-help-chat)
-            """
-        if compute_mode == "local":
-            status_text = f"""
-            📍 Now in **local** mode, using ZeroGPU hardware<br>
-            ⌛ Response time is about one minute<br>
-            ✨ [{embedding_model_id.split("/")[-1]}](https://huggingface.co/{embedding_model_id}) and [{model_id.split("/")[-1]}](https://huggingface.co/{model_id})<br>
-            🏠 See the project's [GitHub repository](https://github.com/jedick/R-help-chat)
-            """
         return status_text
     def get_info_text():
@@ -430,13 +371,13 @@ with gr.Blocks(
             end = None
         info_text = f"""
             **Database:** {len(sources)} emails from {start} to {end}.
-            **Features:** RAG, today's date, hybrid search (dense+sparse), multiple retrievals, citations output (remote), chat memory.
-            **Tech:** LangChain + Hugging Face + Gradio; ChromaDB and BM25S-based retrievers.<br>
             """
         return info_text
-    def get_example_questions(compute_mode, as_dataset=True):
-        """Get example questions based on compute mode"""
         questions = [
             # "What is today's date?",
             "Summarize emails from the most recent two months",
@@ -445,15 +386,11 @@ with gr.Blocks(
             "Who reported installation problems in 2023-2024?",
         ]
-        ## Remove "/think" from questions in remote mode
-        # if compute_mode == "remote":
-        #     questions = [q.replace(" /think", "") for q in questions]
         # cf. https://github.com/gradio-app/gradio/pull/8745 for updating examples
         return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
-    def get_multi_tool_questions(compute_mode, as_dataset=True):
-        """Get multi-tool example questions based on compute mode"""
         questions = [
             "Differences between lapply and for loops",
             "Discuss pipe operator usage in 2022, 2023, and 2024",
@@ -461,8 +398,8 @@ with gr.Blocks(
         return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
-    def get_multi_turn_questions(compute_mode, as_dataset=True):
-        """Get multi-turn example questions based on compute mode"""
         questions = [
             "Lookup emails that reference bugs.r-project.org in 2025",
             "Did the authors you cited report bugs before 2025?",
@@ -474,10 +411,14 @@ with gr.Blocks(
         # Left column: Intro, Compute, Chat
         with gr.Column(scale=2):
             with gr.Row(elem_classes=["row-container"]):
-                with gr.Column(scale=2):
                     intro = gr.Markdown(get_intro_text())
                 with gr.Column(scale=1):
-                    compute_mode.render()
             with gr.Group() as chat_interface:
                 chatbot.render()
                 input.render()
@@ -488,29 +429,23 @@ with gr.Blocks(
             missing_data.render()
         # Right column: Info, Examples
         with gr.Column(scale=1):
-            status = gr.Markdown(get_status_text(compute_mode.value))
             with gr.Accordion("ℹ️ More Info", open=False):
                 info = gr.Markdown(get_info_text())
             with gr.Accordion("💡 Examples", open=True):
                 # Add some helpful examples
                 example_questions = gr.Examples(
-                    examples=get_example_questions(
-                        compute_mode.value, as_dataset=False
-                    ),
                     inputs=[input],
                     label="Click an example to fill the message box",
                 )
                 multi_tool_questions = gr.Examples(
-                    examples=get_multi_tool_questions(
-                        compute_mode.value, as_dataset=False
-                    ),
                     inputs=[input],
                     label="Multiple retrievals",
                 )
                 multi_turn_questions = gr.Examples(
-                    examples=get_multi_turn_questions(
-                        compute_mode.value, as_dataset=False
-                    ),
                     inputs=[input],
                     label="Asking follow-up questions",
                 )
@@ -530,18 +465,6 @@ with gr.Blocks(
         """Return updated value for a component"""
         return gr.update(value=value)
-    def set_avatar(compute_mode):
-        if compute_mode == "remote":
-            image_file = "images/cloud.png"
-        if compute_mode == "local":
-            image_file = "images/chip.png"
-        return gr.update(
-            avatar_images=(
-                None,
-                image_file,
-            ),
-        )
     def change_visibility(visible):
         """Return updated visibility state for a component"""
         return gr.update(visible=visible)
@@ -565,45 +488,10 @@ with gr.Blocks(
     # https://github.com/gradio-app/gradio/issues/9722
     chatbot.clear(generate_thread_id, outputs=[thread_id], api_name=False)
-    def clear_component(component):
-        """Return cleared component"""
-        return component.clear()
-    compute_mode.change(
-        # Start a new thread
-        generate_thread_id,
-        outputs=[thread_id],
-        api_name=False,
-    ).then(
-        # Focus textbox by updating the textbox with the current value
-        lambda x: gr.update(value=x),
-        [input],
-        [input],
-        api_name=False,
-    ).then(
-        # Change the app status text
-        get_status_text,
-        [compute_mode],
-        [status],
-        api_name=False,
-    ).then(
-        # Clear the chatbot history
-        clear_component,
-        [chatbot],
-        [chatbot],
-        api_name=False,
-    ).then(
-        # Change the chatbot avatar
-        set_avatar,
-        [compute_mode],
-        [chatbot],
-        api_name=False,
-    )
     input.submit(
         # Submit input to the chatbot
         to_workflow,
-        [input, chatbot, compute_mode, thread_id],
         [chatbot, retrieved_emails, citations_text],
         api_name=False,
     )

 from langgraph.checkpoint.memory import MemorySaver
+from langchain_openai import ChatOpenAI
 from dotenv import load_dotenv
 from datetime import datetime
 import gradio as gr
 import uuid
 import ast
 import os
 import re
 # Local modules
 from util import get_sources, get_start_end_months
 from data import download_data, extract_data
+from main import openai_model
 from graph import BuildGraph
+from retriever import db_dir
 # Set environment variables
 load_dotenv(dotenv_path=".env", override=True)
 # Hide BM25S progress bars
 os.environ["DISABLE_TQDM"] = "true"
 # Download and extract data if data directory is not present
 if not os.path.isdir(db_dir):
     print("Downloading data ... ", end="")
 # Global variables for LangChain graph: use dictionaries to store user-specific instances
 # https://www.gradio.app/guides/state-in-blocks
+graph_instances = {}
 def cleanup_graph(request: gr.Request):
     timestamp = datetime.now().replace(microsecond=0).isoformat()
+    if request.session_hash in graph_instances:
+        del graph_instances[request.session_hash]
+        print(f"{timestamp} - Delete graph for session {request.session_hash}")
+def extract_think(content):  # Returns a (think_text, post_think) tuple of strings
+    # Added by Cursor 20250726 jmd
+    # Find the first <think>...</think> pair (non-greedy; DOTALL lets the match span newlines)
+    think_match = re.search(r"<think>(.*?)</think>", content, re.DOTALL)
+    think_text = think_match.group(1).strip() if think_match else ""
+    # Keep the text after </think> minus leading whitespace; anything before <think> is dropped
+    if think_match:
+        post_think = content[think_match.end() :].lstrip()
+    else:
+        # No complete pair: check whether content starts with a <think> tag that was never closed
+        if content.strip().startswith("<think>"):
+            # Treat everything after the opening tag as thinking text, leaving no answer text
+            think_start = content.find("<think>") + len("<think>")
+            think_text = content[think_start:].strip()
+            post_think = ""
+        else:
+            # Content does not start with <think>, so return it unchanged as post_think
+            post_think = content
+    return think_text, post_think
 def append_content(chunk_messages, history, thinking_about):
     return history
+def run_workflow(input, history, thread_id, session_hash):
     """The main function to run the chat workflow"""
     # Get graph instance
+    graph = graph_instances.get(session_hash)
     if graph is None:
         # Get the chat model and build the graph
+        chat_model = ChatOpenAI(model=openai_model, temperature=0)
         graph_builder = BuildGraph(
             chat_model,
             search_type,
         )
         # Compile the graph with an in-memory checkpointer
         memory = MemorySaver()
         graph = graph_builder.compile(checkpointer=memory)
+        # Set global graph
+        graph_instances[session_hash] = graph
         # ISO 8601 timestamp with local timezone information without microsecond
         timestamp = datetime.now().replace(microsecond=0).isoformat()
+        print(f"{timestamp} - Set graph for session {session_hash}")
+        ## Notify when model finishes loading
+        # gr.Success("Model loaded!", duration=4)
     else:
         timestamp = datetime.now().replace(microsecond=0).isoformat()
+        print(f"{timestamp} - Get graph for session {session_hash}")
     # print(f"Using thread_id: {thread_id}")
 def to_workflow(request: gr.Request, *args):
+    """Wrapper function to call run_workflow() with session_hash"""
     input = args[0]
     # Add session_hash to arguments
     new_args = args + (request.session_hash,)
+    for value in run_workflow(*new_args):
         yield value
     # Define components
     # -----------------
     loading_data = gr.Textbox(
         "Please wait for the email database to be downloaded and extracted.",
         max_lines=0,
     chatbot = gr.Chatbot(
         type="messages",
         show_label=False,
+        avatar_images=(None, "images/cloud.png"),
         show_copy_all_button=True,
         render=False,
     )
             and generates an answer from the retrieved emails (*emails are shown below the chatbot*).
             You can ask follow-up questions with the chat history as context.
             Press the clear button (🗑) to clear the history and start a new chat.
+            🚧 Under construction: Select a mailing list to search, or use Auto to let the LLM choose.
             """
         return intro
+    def get_status_text():
+        status_text = f"""
+        🌐 This app uses the OpenAI API<br>
+        ⚠️ **_Privacy Notice_**: Data sharing with OpenAI is enabled<br>
+        ✨ text-embedding-3-small and {openai_model}<br>
+        🏠 More info: [R-help-chat GitHub repository](https://github.com/jedick/R-help-chat)
+        """
         return status_text
     def get_info_text():
             end = None
         info_text = f"""
             **Database:** {len(sources)} emails from {start} to {end}.
+            **Features:** RAG, today's date, hybrid search (dense+sparse), multiple retrievals, citations output, chat memory.
+            **Tech:** OpenAI API + LangGraph + Gradio; ChromaDB and BM25S-based retrievers.<br>
             """
         return info_text
+    def get_example_questions(as_dataset=True):
+        """Get example questions"""
         questions = [
             # "What is today's date?",
             "Summarize emails from the most recent two months",
             "Who reported installation problems in 2023-2024?",
         ]
         # cf. https://github.com/gradio-app/gradio/pull/8745 for updating examples
         return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
+    def get_multi_tool_questions(as_dataset=True):
+        """Get multi-tool example questions"""
         questions = [
             "Differences between lapply and for loops",
             "Discuss pipe operator usage in 2022, 2023, and 2024",
         return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
+    def get_multi_turn_questions(as_dataset=True):
+        """Get multi-turn example questions"""
         questions = [
             "Lookup emails that reference bugs.r-project.org in 2025",
             "Did the authors you cited report bugs before 2025?",
         # Left column: Intro, Mailing List, Chat
         with gr.Column(scale=2):
             with gr.Row(elem_classes=["row-container"]):
+                with gr.Column(scale=4):
                     intro = gr.Markdown(get_intro_text())
                 with gr.Column(scale=1):
+                    gr.Radio(
+                        ["Auto", "R-help", "R-devel", "R-pkg-devel"],
+                        label="Mailing List",
+                        interactive=False,
+                    )
             with gr.Group() as chat_interface:
                 chatbot.render()
                 input.render()
             missing_data.render()
         # Right column: Info, Examples
         with gr.Column(scale=1):
+            status = gr.Markdown(get_status_text())
             with gr.Accordion("ℹ️ More Info", open=False):
                 info = gr.Markdown(get_info_text())
             with gr.Accordion("💡 Examples", open=True):
                 # Add some helpful examples
                 example_questions = gr.Examples(
+                    examples=get_example_questions(as_dataset=False),
                     inputs=[input],
                     label="Click an example to fill the message box",
                 )
                 multi_tool_questions = gr.Examples(
+                    examples=get_multi_tool_questions(as_dataset=False),
                     inputs=[input],
                     label="Multiple retrievals",
                 )
                 multi_turn_questions = gr.Examples(
+                    examples=get_multi_turn_questions(as_dataset=False),
                     inputs=[input],
                     label="Asking follow-up questions",
                 )
         """Return updated value for a component"""
         return gr.update(value=value)
     def change_visibility(visible):
         """Return updated visibility state for a component"""
         return gr.update(visible=visible)
     # https://github.com/gradio-app/gradio/issues/9722
     chatbot.clear(generate_thread_id, outputs=[thread_id], api_name=False)
     input.submit(
         # Submit input to the chatbot
         to_workflow,
+        [input, chatbot, thread_id],
         [chatbot, retrieved_emails, citations_text],
         api_name=False,
     )

eval.py CHANGED Viewed

@@ -34,7 +34,7 @@ def load_questions_and_references(csv_path):
     return questions, references
-def build_eval_dataset(questions, references, compute_mode, workflow, search_type):
     """Build dataset for evaluation"""
     dataset = []
     for question, reference in zip(questions, references):
@@ -42,15 +42,15 @@ def build_eval_dataset(questions, references, compute_mode, workflow, search_typ
             if workflow == "chain":
                 print("\n\n--- Question ---")
                 print(question)
-                response = RunChain(question, compute_mode, search_type)
                 print("--- Response ---")
                 print(response)
                 # Retrieve context documents for a question
-                retriever = BuildRetriever(compute_mode, search_type)
                 docs = retriever.invoke(question)
                 retrieved_contexts = [doc.page_content for doc in docs]
             if workflow == "graph":
-                result = RunGraph(question, compute_mode, search_type)
                 retrieved_contexts = []
                 if "retrieved_emails" in result:
                     # Remove the source file names (e.g. R-help/2022-September.txt) as it confuses the evaluator
@@ -142,12 +142,6 @@ def main():
     parser = argparse.ArgumentParser(
         description="Evaluate RAG retrieval and generation."
     )
-    parser.add_argument(
-        "--compute_mode",
-        choices=["remote", "local"],
-        required=True,
-        help="Compute mode: remote or local.",
-    )
     parser.add_argument(
         "--workflow",
         choices=["chain", "graph"],
@@ -161,14 +155,11 @@ def main():
         help="Search type: dense, sparse, or hybrid.",
     )
     args = parser.parse_args()
-    compute_mode = args.compute_mode
     workflow = args.workflow
     search_type = args.search_type
     questions, references = load_questions_and_references("eval.csv")
-    dataset = build_eval_dataset(
-        questions, references, compute_mode, workflow, search_type
-    )
     evaluation_dataset = EvaluationDataset.from_list(dataset)
     # Set up LLM for evaluation

     return questions, references
+def build_eval_dataset(questions, references, workflow, search_type):
     """Build dataset for evaluation"""
     dataset = []
     for question, reference in zip(questions, references):
             if workflow == "chain":
                 print("\n\n--- Question ---")
                 print(question)
+                response = RunChain(question, search_type)
                 print("--- Response ---")
                 print(response)
                 # Retrieve context documents for a question
+                retriever = BuildRetriever(search_type)
                 docs = retriever.invoke(question)
                 retrieved_contexts = [doc.page_content for doc in docs]
             if workflow == "graph":
+                result = RunGraph(question, search_type)
                 retrieved_contexts = []
                 if "retrieved_emails" in result:
                     # Remove the source file names (e.g. R-help/2022-September.txt) as it confuses the evaluator
     parser = argparse.ArgumentParser(
         description="Evaluate RAG retrieval and generation."
     )
     parser.add_argument(
         "--workflow",
         choices=["chain", "graph"],
         help="Search type: dense, sparse, or hybrid.",
     )
     args = parser.parse_args()
     workflow = args.workflow
     search_type = args.search_type
     questions, references = load_questions_and_references("eval.csv")
+    dataset = build_eval_dataset(questions, references, workflow, search_type)
     evaluation_dataset = EvaluationDataset.from_list(dataset)
     # Set up LLM for evaluation

graph.py CHANGED Viewed

@@ -2,15 +2,13 @@ from langchain_core.messages import SystemMessage, ToolMessage, HumanMessage, AI
 from langgraph.graph import START, END, MessagesState, StateGraph
 from langchain_core.tools import tool
 from langgraph.prebuilt import ToolNode, tools_condition
-from langchain_huggingface import ChatHuggingFace
 from typing import Optional
 import datetime
 import os
 # Local modules
 from retriever import BuildRetriever
-from prompts import query_prompt, answer_prompt, generic_tools_template
-from mods.tool_calling_llm import ToolCallingLLM
 # For tracing (disabled)
 # os.environ["LANGSMITH_TRACING"] = "true"
@@ -105,48 +103,18 @@ def normalize_messages(messages, summaries_for=None):
     return messages
-def ToolifyHF(chat_model, system_message):
-    """
-    Get a Hugging Face model ready for bind_tools().
-    """
-    # Combine system prompt and tools template
-    tool_system_prompt_template = system_message + generic_tools_template
-    class HuggingFaceWithTools(ToolCallingLLM, ChatHuggingFace):
-        def __init__(self, **kwargs):
-            super().__init__(**kwargs)
-    chat_model = HuggingFaceWithTools(
-        llm=chat_model.llm,
-        tool_system_prompt_template=tool_system_prompt_template,
-    )
-    return chat_model
 def BuildGraph(
     chat_model,
-    compute_mode,
     search_type,
     top_k=6,
-    think_query=False,
-    think_answer=False,
-    local_citations=False,
-    embedding_ckpt_dir=None,
 ):
     """
     Build conversational RAG graph for email retrieval and answering with citations.
     Args:
-        chat_model: LangChain chat model from GetChatModel()
-        compute_mode: remote or local (for retriever)
         search_type: dense, sparse, or hybrid (for retriever)
         top_k: number of documents to retrieve
-        think_query: Whether to use thinking mode for the query (local model)
-        think_answer: Whether to use thinking mode for the answer (local model)
-        local_citations: Whether to use answer_with_citations() tool (local model)
-        embedding_ckpt_dir: Directory for embedding model checkpoint
     Based on:
         https://python.langchain.com/docs/how_to/qa_sources
@@ -158,7 +126,7 @@ def BuildGraph(
         # Build graph with chat model
         from langchain_openai import ChatOpenAI
         chat_model = ChatOpenAI(model="gpt-4o-mini")
-        graph = BuildGraph(chat_model, "remote", "hybrid")
         # Add simple in-memory checkpointer
         from langgraph.checkpoint.memory import MemorySaver
@@ -198,7 +166,10 @@ def BuildGraph(
             months (str, optional): One or more months separated by spaces
         """
         retriever = BuildRetriever(
-            compute_mode, search_type, top_k, start_year, end_year, embedding_ckpt_dir
         )
         # For now, just add the months to the search query
         if months:
@@ -230,55 +201,23 @@ def BuildGraph(
         """
         return answer, citations
-    # Add tools to the local or remote chat model
-    is_local = hasattr(chat_model, "model_id")
-    if is_local:
-        # For local models (ChatHuggingFace with SmolLM, Gemma, or Qwen)
-        query_model = ToolifyHF(
-            chat_model, query_prompt(chat_model, think=think_query)
-        ).bind_tools([retrieve_emails])
-        if local_citations:
-            answer_model = ToolifyHF(
-                chat_model,
-                answer_prompt(chat_model, think=think_answer, with_tools=True),
-            ).bind_tools([answer_with_citations])
-        else:
-            # Don't use answer_with_citations tool because responses with are sometimes unparseable
-            answer_model = chat_model
-    else:
-        # For remote model (OpenAI API)
-        query_model = chat_model.bind_tools([retrieve_emails])
-        answer_model = chat_model.bind_tools([answer_with_citations])
     # Initialize the graph object
     graph = StateGraph(MessagesState)
     def query(state: MessagesState):
         """Queries the retriever with the chat model"""
-        if is_local:
-            # Don't include the system message here because it's defined in ToolCallingLLM
-            messages = state["messages"]
-            messages = normalize_messages(messages)
-        else:
-            messages = [SystemMessage(query_prompt(chat_model))] + state["messages"]
         response = query_model.invoke(messages)
         return {"messages": response}
     def answer(state: MessagesState):
         """Generates an answer with the chat model"""
-        if is_local:
-            messages = state["messages"]
-            messages = normalize_messages(messages)
-            if not local_citations:
-                # Add the system message here if we're not using tools
-                messages = [
-                    SystemMessage(answer_prompt(chat_model, think=think_answer))
-                ] + messages
-        else:
-            messages = [
-                SystemMessage(answer_prompt(chat_model, with_tools=True))
-            ] + state["messages"]
         response = answer_model.invoke(messages)
         return {"messages": response}

 from langgraph.graph import START, END, MessagesState, StateGraph
 from langchain_core.tools import tool
 from langgraph.prebuilt import ToolNode, tools_condition
 from typing import Optional
 import datetime
 import os
 # Local modules
 from retriever import BuildRetriever
+from prompts import query_prompt, answer_prompt
 # For tracing (disabled)
 # os.environ["LANGSMITH_TRACING"] = "true"
     return messages
 def BuildGraph(
     chat_model,
     search_type,
     top_k=6,
 ):
     """
     Build conversational RAG graph for email retrieval and answering with citations.
     Args:
+        chat_model: LangChain chat model
         search_type: dense, sparse, or hybrid (for retriever)
         top_k: number of documents to retrieve
     Based on:
         https://python.langchain.com/docs/how_to/qa_sources
         # Build graph with chat model
         from langchain_openai import ChatOpenAI
         chat_model = ChatOpenAI(model="gpt-4o-mini")
+        graph = BuildGraph(chat_model, "hybrid")
         # Add simple in-memory checkpointer
         from langgraph.checkpoint.memory import MemorySaver
             months (str, optional): One or more months separated by spaces
         """
         retriever = BuildRetriever(
+            search_type,
+            top_k,
+            start_year,
+            end_year,
         )
         # For now, just add the months to the search query
         if months:
         """
         return answer, citations
+    # Add tools to the chat model
+    query_model = chat_model.bind_tools([retrieve_emails])
+    answer_model = chat_model.bind_tools([answer_with_citations])
     # Initialize the graph object
     graph = StateGraph(MessagesState)
     def query(state: MessagesState):
         """Queries the retriever with the chat model"""
+        messages = [SystemMessage(query_prompt())] + state["messages"]
         response = query_model.invoke(messages)
         return {"messages": response}
     def answer(state: MessagesState):
         """Generates an answer with the chat model"""
+        messages = [SystemMessage(answer_prompt())] + state["messages"]
         response = answer_model.invoke(messages)
         return {"messages": response}

images/graph_LR.png CHANGED Viewed

index.py CHANGED Viewed

@@ -9,14 +9,13 @@ from retriever import BuildRetriever, db_dir
 from mods.bm25s_retriever import BM25SRetriever
-def ProcessFile(file_path, search_type: str = "dense", compute_mode: str = "remote"):
     """
     Wrapper function to process file for dense or sparse search
     Args:
         file_path: File to process
         search_type: Type of search to use. Options: "dense", "sparse"
-        compute_mode: Compute mode for embeddings (remote or local)
     """
     # Preprocess: remove quoted lines and handle email boundaries
@@ -69,7 +68,7 @@ def ProcessFile(file_path, search_type: str = "dense", compute_mode: str = "remo
             ProcessFileSparse(truncated_temp_file, file_path)
         elif search_type == "dense":
             # Handle dense search with ChromaDB
-            ProcessFileDense(truncated_temp_file, file_path, compute_mode)
         else:
             raise ValueError(f"Unsupported search type: {search_type}")
     finally:
@@ -81,12 +80,12 @@ def ProcessFile(file_path, search_type: str = "dense", compute_mode: str = "remo
             pass
-def ProcessFileDense(cleaned_temp_file, file_path, compute_mode):
     """
     Process file for dense vector search using ChromaDB
     """
     # Get a retriever instance
-    retriever = BuildRetriever(compute_mode, "dense")
     # Load cleaned text file
     loader = TextLoader(cleaned_temp_file)
     documents = loader.load()

 from mods.bm25s_retriever import BM25SRetriever
+def ProcessFile(file_path, search_type: str = "dense"):
     """
     Wrapper function to process file for dense or sparse search
     Args:
         file_path: File to process
         search_type: Type of search to use. Options: "dense", "sparse"
     """
     # Preprocess: remove quoted lines and handle email boundaries
             ProcessFileSparse(truncated_temp_file, file_path)
         elif search_type == "dense":
             # Handle dense search with ChromaDB
+            ProcessFileDense(truncated_temp_file, file_path)
         else:
             raise ValueError(f"Unsupported search type: {search_type}")
     finally:
             pass
+def ProcessFileDense(cleaned_temp_file, file_path):
     """
     Process file for dense vector search using ChromaDB
     """
     # Get a retriever instance
+    retriever = BuildRetriever("dense")
     # Load cleaned text file
     loader = TextLoader(cleaned_temp_file)
     documents = loader.load()

main.py CHANGED Viewed

@@ -5,20 +5,15 @@ from langchain_core.prompts import ChatPromptTemplate
 from langgraph.checkpoint.memory import MemorySaver
 from langchain_core.messages import SystemMessage
 from langchain_core.messages import ToolMessage
 from dotenv import load_dotenv
 from datetime import datetime
 import logging
-import torch
 import glob
 import ast
 import os
-# Imports for local and remote chat models
-from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline
-from langchain_openai import ChatOpenAI
 # Local modules
-from pipeline import MyTextGenerationPipeline
 from retriever import BuildRetriever, db_dir
 from prompts import answer_prompt
 from index import ProcessFile
@@ -32,16 +27,9 @@ from graph import BuildGraph
 # Setup environment variables
 load_dotenv(dotenv_path=".env", override=True)
-# Define the remote (OpenAI) model
 openai_model = "gpt-4o-mini"
-# Get the local model ID
-model_id = os.getenv("MODEL_ID")
-if model_id is None:
-    # model_id = "HuggingFaceTB/SmolLM3-3B"
-    model_id = "google/gemma-3-12b-it"
-    # model_id = "Qwen/Qwen3-14B"
 # Suppress these messages:
 # INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
 # INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
@@ -50,30 +38,29 @@ httpx_logger = logging.getLogger("httpx")
 httpx_logger.setLevel(logging.WARNING)
-def ProcessDirectory(path, compute_mode):
     """
     Update vector store and sparse index for files in a directory, only adding new or updated files
     Args:
         path: Directory to process
-        compute_mode: Compute mode for embeddings (remote or local)
     Usage example:
-        ProcessDirectory("R-help", "remote")
     """
     # TODO: use UUID to process only changed documents
     # https://stackoverflow.com/questions/76265631/chromadb-add-single-document-only-if-it-doesnt-exist
     # Get a dense retriever instance
-    retriever = BuildRetriever(compute_mode, "dense")
     # List all text files in target directory
     file_paths = glob.glob(f"{path}/*.txt")
     for file_path in file_paths:
         # Process file for sparse search (BM25S)
-        ProcessFile(file_path, "sparse", compute_mode)
         # Logic for dense search: skip file if already indexed
         # Look for existing embeddings for this file
@@ -103,7 +90,7 @@ def ProcessDirectory(path, compute_mode):
                 update_file = True
         if add_file:
-            ProcessFile(file_path, "dense", compute_mode)
         if update_file:
             print(f"Chroma: updated embeddings for {file_path}")
@@ -114,7 +101,7 @@ def ProcessDirectory(path, compute_mode):
             ]
             files_to_keep = list(set(used_doc_ids))
             # Get all files in the file store
-            file_store = f"{db_dir}/file_store_{compute_mode}"
             all_files = os.listdir(file_store)
             # Iterate through the files and delete those not in the list
             for file in all_files:
@@ -127,93 +114,32 @@ def ProcessDirectory(path, compute_mode):
             print(f"Chroma: no change for {file_path}")
-def GetChatModel(compute_mode, ckpt_dir=None):
-    """
-    Get a chat model.
-    Args:
-        compute_mode: Compute mode for chat model (remote or local)
-        ckpt_dir: Checkpoint directory for model weights (optional)
-    """
-    if compute_mode == "remote":
-        chat_model = ChatOpenAI(model=openai_model, temperature=0)
-    if compute_mode == "local":
-        # Don't try to use local models without a GPU
-        if compute_mode == "local" and not torch.cuda.is_available():
-            raise Exception("Local chat model selected without GPU")
-        # Define the pipeline to pass to the HuggingFacePipeline class
-        # https://huggingface.co/blog/langchain
-        id_or_dir = ckpt_dir if ckpt_dir else model_id
-        tokenizer = AutoTokenizer.from_pretrained(id_or_dir)
-        model = AutoModelForCausalLM.from_pretrained(
-            id_or_dir,
-            # We need this to load the model in BF16 instead of fp32 (torch.float)
-            torch_dtype=torch.bfloat16,
-            # Enable FlashAttention (requires pip install flash-attn)
-            # https://huggingface.co/docs/transformers/en/attention_interface
-            # https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention
-            # attn_implementation="flash_attention_2",
-        )
-        # For Flash Attention version of Qwen3
-        tokenizer.padding_side = "left"
-        # Use MyTextGenerationPipeline with custom preprocess() method
-        pipe = MyTextGenerationPipeline(
-            model=model,
-            tokenizer=tokenizer,
-            # ToolCallingLLM needs return_full_text=False in order to parse just the assistant response
-            return_full_text=False,
-            # It seems that max_new_tokens has to be specified here, not in .invoke()
-            max_new_tokens=2000,
-            # Use padding for proper alignment for FlashAttention
-            # Part of fix for: "RuntimeError: p.attn_bias_ptr is not correctly aligned"
-            # https://github.com/google-deepmind/gemma/issues/169
-            padding="longest",
-        )
-        # We need the task so HuggingFacePipeline can deal with our class
-        pipe.task = "text-generation"
-        llm = HuggingFacePipeline(pipeline=pipe)
-        chat_model = ChatHuggingFace(llm=llm)
-    return chat_model
 def RunChain(
     query,
-    compute_mode: str = "remote",
     search_type: str = "hybrid",
-    think: bool = False,
 ):
     """
     Run chain to retrieve documents and send to chat
     Args:
         query: User's query
-        compute_mode: Compute mode for embedding and chat models (remote or local)
         search_type: Type of search to use. Options: "dense", "sparse", or "hybrid"
-        think: Control thinking mode for SmolLM3
     Example:
         RunChain("What R functions are discussed?")
     """
     # Get retriever instance
-    retriever = BuildRetriever(compute_mode, search_type)
     if retriever is None:
         return "No retriever available. Please process some documents first."
     # Get chat model (LLM)
-    chat_model = GetChatModel(compute_mode)
-    # Get prompt with /no_think for SmolLM3/Qwen
-    system_prompt = answer_prompt(chat_model)
     # Create a prompt template
     system_template = ChatPromptTemplate.from_messages([SystemMessage(system_prompt)])
@@ -244,22 +170,16 @@ def RunChain(
 def RunGraph(
     query: str,
-    compute_mode: str = "remote",
     search_type: str = "hybrid",
     top_k: int = 6,
-    think_query=False,
-    think_answer=False,
     thread_id=None,
 ):
     """Run graph for conversational RAG app
     Args:
         query: User query to start the chat
-        compute_mode: Compute mode for embedding and chat models (remote or local)
         search_type: Type of search to use. Options: "dense", "sparse", or "hybrid"
         top_k: Number of documents to retrieve
-        think_query: Whether to use thinking mode for the query
-        think_answer: Whether to use thinking mode for the answer
         thread_id: Thread ID for memory (optional)
     Example:
@@ -267,15 +187,12 @@ def RunGraph(
     """
     # Get chat model used in both query and generate steps
-    chat_model = GetChatModel(compute_mode)
     # Build the graph
     graph_builder = BuildGraph(
         chat_model,
-        compute_mode,
         search_type,
         top_k,
-        think_query,
-        think_answer,
     )
     # Compile the graph with an in-memory checkpointer

 from langgraph.checkpoint.memory import MemorySaver
 from langchain_core.messages import SystemMessage
 from langchain_core.messages import ToolMessage
+from langchain_openai import ChatOpenAI
 from dotenv import load_dotenv
 from datetime import datetime
 import logging
 import glob
 import ast
 import os
 # Local modules
 from retriever import BuildRetriever, db_dir
 from prompts import answer_prompt
 from index import ProcessFile
 # Setup environment variables
 load_dotenv(dotenv_path=".env", override=True)
+# Define the OpenAI model
 openai_model = "gpt-4o-mini"
 # Suppress these messages:
 # INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
 # INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 httpx_logger.setLevel(logging.WARNING)
+def ProcessDirectory(path):
     """
     Update vector store and sparse index for files in a directory, only adding new or updated files
     Args:
         path: Directory to process
     Usage example:
+        ProcessDirectory("R-help")
     """
     # TODO: use UUID to process only changed documents
     # https://stackoverflow.com/questions/76265631/chromadb-add-single-document-only-if-it-doesnt-exist
     # Get a dense retriever instance
+    retriever = BuildRetriever("dense")
     # List all text files in target directory
     file_paths = glob.glob(f"{path}/*.txt")
     for file_path in file_paths:
         # Process file for sparse search (BM25S)
+        ProcessFile(file_path, "sparse")
         # Logic for dense search: skip file if already indexed
         # Look for existing embeddings for this file
                 update_file = True
         if add_file:
+            ProcessFile(file_path, "dense")
         if update_file:
             print(f"Chroma: updated embeddings for {file_path}")
             ]
             files_to_keep = list(set(used_doc_ids))
             # Get all files in the file store
+            file_store = f"{db_dir}/file_store"
             all_files = os.listdir(file_store)
             # Iterate through the files and delete those not in the list
             for file in all_files:
             print(f"Chroma: no change for {file_path}")
 def RunChain(
     query,
     search_type: str = "hybrid",
 ):
     """
     Run chain to retrieve documents and send to chat
     Args:
         query: User's query
         search_type: Type of search to use. Options: "dense", "sparse", or "hybrid"
     Example:
         RunChain("What R functions are discussed?")
     """
     # Get retriever instance
+    retriever = BuildRetriever(search_type)
     if retriever is None:
         return "No retriever available. Please process some documents first."
     # Get chat model (LLM)
+    chat_model = ChatOpenAI(model=openai_model, temperature=0)
+    # Get system prompt
+    system_prompt = answer_prompt()
     # Create a prompt template
     system_template = ChatPromptTemplate.from_messages([SystemMessage(system_prompt)])
 def RunGraph(
     query: str,
     search_type: str = "hybrid",
     top_k: int = 6,
     thread_id=None,
 ):
     """Run graph for conversational RAG app
     Args:
         query: User query to start the chat
         search_type: Type of search to use. Options: "dense", "sparse", or "hybrid"
         top_k: Number of documents to retrieve
         thread_id: Thread ID for memory (optional)
     Example:
     """
     # Get chat model used in both query and generate steps
+    chat_model = ChatOpenAI(model=openai_model, temperature=0)
     # Build the graph
     graph_builder = BuildGraph(
         chat_model,
         search_type,
         top_k,
     )
     # Compile the graph with an in-memory checkpointer

mods/tool_calling_llm.py DELETED Viewed

@@ -1,313 +0,0 @@
-import re
-import json
-import uuid
-import warnings
-from abc import ABC
-from typing import (
-    Any,
-    AsyncIterator,
-    Callable,
-    Dict,
-    List,
-    Optional,
-    Sequence,
-    Tuple,
-    Type,
-    Union,
-    cast,
-)
-from langchain_core.callbacks import (
-    AsyncCallbackManagerForLLMRun,
-    CallbackManagerForLLMRun,
-)
-from langchain_core.language_models import BaseChatModel, LanguageModelInput
-from langchain_core.messages import (
-    SystemMessage,
-    AIMessage,
-    BaseMessage,
-    BaseMessageChunk,
-    ToolCall,
-)
-from langchain_core.outputs import ChatGeneration, ChatResult
-from langchain_core.prompts import SystemMessagePromptTemplate
-from pydantic import BaseModel
-from langchain_core.runnables import Runnable, RunnableConfig
-from langchain_core.tools import BaseTool
-from langchain_core.utils.function_calling import convert_to_openai_tool
-DEFAULT_SYSTEM_TEMPLATE = """You have access to the following tools:
-{tools}
-You must always select one of the above tools and respond with only a JSON object matching the following schema:
-{{
-  "tool": <name of selected tool 1>,
-  "tool_input": <parameters for selected tool 1, matching the tool's JSON schema>
-}},
-{{
-  "tool": <name of selected tool 2>,
-  "tool_input": <parameters for selected tool 2, matching the tool's JSON schema>
-}}
-"""  # noqa: E501
-def extract_think(content):
-    # Added by Cursor 20250726 jmd
-    # Extract content within <think>...</think>
-    think_match = re.search(r"<think>(.*?)</think>", content, re.DOTALL)
-    think_text = think_match.group(1).strip() if think_match else ""
-    # Extract text after </think>
-    if think_match:
-        post_think = content[think_match.end() :].lstrip()
-    else:
-        # Check if content starts with <think> but missing closing tag
-        if content.strip().startswith("<think>"):
-            # Extract everything after <think>
-            think_start = content.find("<think>") + len("<think>")
-            think_text = content[think_start:].strip()
-            post_think = ""
-        else:
-            # No <think> found, so return entire content as post_think
-            post_think = content
-    return think_text, post_think
-class ToolCallingLLM(BaseChatModel, ABC):
-    """ToolCallingLLM mixin to enable tool calling features on non tool calling models.
-    Note: This is an incomplete mixin and should not be used directly. It must be used to extend an existing Chat Model.
-    Setup:
-      Install dependencies for your Chat Model.
-      Any API Keys or setup needed for your Chat Model is still applicable.
-    Key init args — completion params:
-      Refer to the documentation of the Chat Model you wish to extend with Tool Calling.
-    Key init args — client params:
-      Refer to the documentation of the Chat Model you wish to extend with Tool Calling.
-    See full list of supported init args and their descriptions in the params section.
-    Instantiate:
-      ```
-      # Example implementation using LiteLLM
-      from langchain_community.chat_models import ChatLiteLLM
-      class LiteLLMFunctions(ToolCallingLLM, ChatLiteLLM):
-          def __init__(self, **kwargs: Any) -> None:
-              super().__init__(**kwargs)
-          @property
-          def _llm_type(self) -> str:
-              return "litellm_functions"
-      llm = LiteLLMFunctions(model="ollama/phi3")
-      ```
-    Invoke:
-      ```
-      messages = [
-        ("human", "What is the capital of France?")
-      ]
-      llm.invoke(messages)
-      ```
-      ```
-      AIMessage(content='The capital of France is Paris.', id='run-497d0e1a-d63b-45e8-9c8b-5e76d99b9468-0')
-      ```
-    Tool calling:
-      ```
-      from pydantic import BaseModel, Field
-      class GetWeather(BaseModel):
-          '''Get the current weather in a given location'''
-          location: str = Field(..., description="The city and state, e.g. San Francisco, CA")
-      class GetPopulation(BaseModel):
-          '''Get the current population in a given location'''
-          location: str = Field(..., description="The city and state, e.g. San Francisco, CA")
-      llm_with_tools = llm.bind_tools([GetWeather, GetPopulation])
-      ai_msg = llm_with_tools.invoke("Which city is hotter today and which is bigger: LA or NY?")
-      ai_msg.tool_calls
-      ```
-      ```
-      [{'name': 'GetWeather', 'args': {'location': 'Austin, TX'}, 'id': 'call_25ed526917b94d8fa5db3fe30a8cf3c0'}]
-      ```
-    Response metadata
-      Refer to the documentation of the Chat Model you wish to extend with Tool Calling.
-    """  # noqa: E501
-    tool_system_prompt_template: str = DEFAULT_SYSTEM_TEMPLATE
-    def __init__(self, **kwargs: Any) -> None:
-        super().__init__(**kwargs)
-    def _generate_system_message_and_functions(
-        self,
-        kwargs: Dict[str, Any],
-    ) -> Tuple[BaseMessage, List]:
-        functions = kwargs.get("tools", [])
-        # Convert functions to OpenAI tool schema
-        functions = [convert_to_openai_tool(fn) for fn in functions]
-        # Create system message with tool descriptions
-        system_message_prompt_template = SystemMessagePromptTemplate.from_template(
-            self.tool_system_prompt_template
-        )
-        system_message = system_message_prompt_template.format(
-            tools=json.dumps(functions, indent=2)
-        )
-        return system_message, functions
-    def _process_response(
-        self, response_message: BaseMessage, functions: List[Dict]
-    ) -> AIMessage:
-        if not isinstance(response_message.content, str):
-            raise ValueError("ToolCallingLLM does not support non-string output.")
-        # Extract <think>...</think> content and text after </think> for further processing 20250726 jmd
-        think_text, post_think = extract_think(response_message.content)
-        ## For debugging
-        # print("post_think")
-        # print(post_think)
-        # Remove backticks around code blocks
-        post_think = re.sub(r"^```json", "", post_think)
-        post_think = re.sub(r"^```", "", post_think)
-        post_think = re.sub(r"```$", "", post_think)
-        # Remove intervening backticks from adjacent code blocks
-        post_think = re.sub(r"```\n```json", ",", post_think)
-        # Remove trailing comma (if there is one)
-        post_think = post_think.rstrip(",")
-        # Parse output for JSON (support multiple objects separated by commas)
-        try:
-            # Works for one JSON object, or multiple JSON objects enclosed in "[]"
-            parsed_json_results = json.loads(f"{post_think}")
-            if not isinstance(parsed_json_results, list):
-                parsed_json_results = [parsed_json_results]
-        except:
-            try:
-                # Works for multiple JSON objects not enclosed in "[]"
-                parsed_json_results = json.loads(f"[{post_think}]")
-            except json.JSONDecodeError:
-                # Return entire response if JSON wasn't parsed or is missing
-                return AIMessage(content=response_message.content)
-        # print("parsed_json_results")
-        # print(parsed_json_results)
-        tool_calls = []
-        for parsed_json_result in parsed_json_results:
-            # Get tool name from output
-            called_tool_name = (
-                parsed_json_result["tool"]
-                if "tool" in parsed_json_result
-                else (
-                    parsed_json_result["name"] if "name" in parsed_json_result else None
-                )
-            )
-            # Check if tool name is in functions list
-            called_tool = next(
-                (fn for fn in functions if fn["function"]["name"] == called_tool_name),
-                None,
-            )
-            if called_tool is None:
-                # Issue a warning and skip this tool call
-                warnings.warn(f"Called tool ({called_tool_name}) not in functions list")
-                continue
-            # Get tool arguments from output
-            called_tool_arguments = (
-                parsed_json_result["tool_input"]
-                if "tool_input" in parsed_json_result
-                else (
-                    parsed_json_result["parameters"]
-                    if "parameters" in parsed_json_result
-                    else {}
-                )
-            )
-            tool_calls.append(
-                ToolCall(
-                    name=called_tool_name,
-                    args=called_tool_arguments,
-                    id=f"call_{str(uuid.uuid4()).replace('-', '')}",
-                )
-            )
-        if not tool_calls:
-            # If nothing valid, return original content
-            return AIMessage(content=response_message.content)
-        # Put together response message
-        response_message = AIMessage(
-            content=f"<think>\n{think_text}\n</think>",
-            tool_calls=tool_calls,
-        )
-        return response_message
-    def _generate(
-        self,
-        messages: List[BaseMessage],
-        stop: Optional[List[str]] = None,
-        run_manager: Optional[CallbackManagerForLLMRun] = None,
-        **kwargs: Any,
-    ) -> ChatResult:
-        system_message, functions = self._generate_system_message_and_functions(kwargs)
-        response_message = super()._generate(  # type: ignore[safe-super]
-            [system_message] + messages, stop=stop, run_manager=run_manager, **kwargs
-        )
-        response = self._process_response(
-            response_message.generations[0].message, functions
-        )
-        return ChatResult(generations=[ChatGeneration(message=response)])
-    async def _agenerate(
-        self,
-        messages: List[BaseMessage],
-        stop: Optional[List[str]] = None,
-        run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
-        **kwargs: Any,
-    ) -> ChatResult:
-        system_message, functions = self._generate_system_message_and_functions(kwargs)
-        response_message = await super()._agenerate(
-            [system_message] + messages, stop=stop, run_manager=run_manager, **kwargs
-        )
-        response = self._process_response(
-            response_message.generations[0].message, functions
-        )
-        return ChatResult(generations=[ChatGeneration(message=response)])
-    async def astream(
-        self,
-        input: LanguageModelInput,
-        config: Optional[RunnableConfig] = None,
-        *,
-        stop: Optional[List[str]] = None,
-        **kwargs: Any,
-    ) -> AsyncIterator[BaseMessageChunk]:
-        system_message, functions = self._generate_system_message_and_functions(kwargs)
-        generation: Optional[BaseMessageChunk] = None
-        async for chunk in super().astream(
-            [system_message] + super()._convert_input(input).to_messages(),
-            stop=stop,
-            **kwargs,
-        ):
-            if generation is None:
-                generation = chunk
-            else:
-                generation += chunk
-        assert generation is not None
-        response = self._process_response(generation, functions)
-        yield cast(BaseMessageChunk, response)

pipeline.py DELETED Viewed

@@ -1,86 +0,0 @@
-from transformers.pipelines.text_generation import Chat
-from transformers import TextGenerationPipeline
-from typing import Dict
-class MyTextGenerationPipeline(TextGenerationPipeline):
-    """
-    This subclass overrides the preprocess method to add pad_to_multiple_of=8 to tokenizer_kwargs.
-    Fix for: "RuntimeError: p.attn_bias_ptr is not correctly aligned"
-    https://github.com/google-deepmind/gemma/issues/169
-    NOTE: we also need padding="longest", which is set during class instantiation
-    """
-    def preprocess(
-        self,
-        prompt_text,
-        prefix="",
-        handle_long_generation=None,
-        add_special_tokens=None,
-        truncation=None,
-        padding=None,
-        max_length=None,
-        continue_final_message=None,
-        **generate_kwargs,
-    ):
-        # Only set non-None tokenizer kwargs, so as to rely on the tokenizer's defaults
-        tokenizer_kwargs = {
-            "add_special_tokens": add_special_tokens,
-            "truncation": truncation,
-            "padding": padding,
-            "max_length": max_length,
-            "pad_to_multiple_of": 8,
-        }
-        tokenizer_kwargs = {
-            key: value for key, value in tokenizer_kwargs.items() if value is not None
-        }
-        if isinstance(prompt_text, Chat):
-            tokenizer_kwargs.pop(
-                "add_special_tokens", None
-            )  # ignore add_special_tokens on chats
-            # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
-            # because very few models support multiple separate, consecutive assistant messages
-            if continue_final_message is None:
-                continue_final_message = prompt_text.messages[-1]["role"] == "assistant"
-            inputs = self.tokenizer.apply_chat_template(
-                prompt_text.messages,
-                add_generation_prompt=not continue_final_message,
-                continue_final_message=continue_final_message,
-                return_dict=True,
-                return_tensors=self.framework,
-                **tokenizer_kwargs,
-            )
-        else:
-            inputs = self.tokenizer(
-                prefix + prompt_text, return_tensors=self.framework, **tokenizer_kwargs
-            )
-        inputs["prompt_text"] = prompt_text
-        if handle_long_generation == "hole":
-            cur_len = inputs["input_ids"].shape[-1]
-            if "max_new_tokens" in generate_kwargs:
-                new_tokens = generate_kwargs["max_new_tokens"]
-            else:
-                new_tokens = (
-                    generate_kwargs.get("max_length", self.generation_config.max_length)
-                    - cur_len
-                )
-                if new_tokens < 0:
-                    raise ValueError("We cannot infer how many new tokens are expected")
-            if cur_len + new_tokens > self.tokenizer.model_max_length:
-                keep_length = self.tokenizer.model_max_length - new_tokens
-                if keep_length <= 0:
-                    raise ValueError(
-                        "We cannot use `hole` to handle this generation the number of desired tokens exceeds the"
-                        " models max length"
-                    )
-                inputs["input_ids"] = inputs["input_ids"][:, -keep_length:]
-                if "attention_mask" in inputs:
-                    inputs["attention_mask"] = inputs["attention_mask"][
-                        :, -keep_length:
-                    ]
-        return inputs

prompts.py CHANGED Viewed

@@ -3,22 +3,16 @@ from util import get_sources, get_start_end_months
 import re
-def check_prompt(prompt, chat_model, think):
-    """Check for unassigned variables and add /no_think if needed"""
     # A sanity check that we don't have unassigned variables
-    # (this causes KeyError in parsing by ToolCallingLLM)
     matches = re.findall(r"\{.*?\}", " ".join(prompt))
     if matches:
         raise ValueError(f"Unassigned variables in prompt: {' '.join(matches)}")
-    # Check if we should add /no_think to turn off thinking mode
-    if hasattr(chat_model, "model_id"):
-        model_id = chat_model.model_id
-        if ("SmolLM" in model_id or "Qwen" in model_id) and not think:
-            prompt = "/no_think\n" + prompt
     return prompt
-def query_prompt(chat_model, think=False):
     """Return system prompt for query step"""
     # Get start and end months from database
@@ -43,12 +37,12 @@ def query_prompt(chat_model, think=False):
         # "Do not use your memory or knowledge to answer the user's question. Only retrieve emails based on the user's question. "  # Qwen
         # "If you decide not to retrieve emails, tell the user why and suggest how to improve their question to chat with the R-help mailing list. "
     )
-    prompt = check_prompt(prompt, chat_model, think)
     return prompt
-def answer_prompt(chat_model, think=False, with_tools=False):
     """Return system prompt for answer step"""
     prompt = (
         f"Today Date: {date.today()}. "
@@ -64,61 +58,8 @@ def answer_prompt(chat_model, think=False, with_tools=False):
         "Only answer general questions about R if the answer is in the retrieved emails. "
         "Only include URLs if they were used by human authors (not in email headers), and do not modify any URLs. "  # Qwen, Gemma
         "Respond with 500 words maximum and 50 lines of code maximum. "
     )
-    if with_tools:
-        prompt = (
-            f"{prompt}"
-            "Use answer_with_citations to provide the complete answer and all citations used. "
-        )
-    prompt = check_prompt(prompt, chat_model, think)
     return prompt
-# Prompt template for SmolLM3 with tools
-# The first two lines, <function-name>, and <args-json-object> are from the apply_chat_template for HuggingFaceTB/SmolLM3-3B
-# The other lines (You have, {tools}, You must), "tool", and "tool_input" are from tool_calling_llm.py
-smollm3_tools_template = """
-### Tools
-You may call one or more functions to assist with the user query.
-You have access to the following tools:
-{tools}
-You must always select one of the above tools and respond with only a JSON object matching the following schema:
-{{
-    "tool": <function-name>,
-    "tool_input": <args-json-object>
-}},
-{{
-    "tool": <function-name>,
-    "tool_input": <args-json-object>
-}}
-"""
-# Prompt template for Gemma/Qwen with tools
-# Based on https://ai.google.dev/gemma/docs/capabilities/function-calling
-generic_tools_template = """
-### Functions
-You have access to functions. If you decide to invoke any of the function(s), you MUST put it in the format of
-{{
-    "tool": <function-name>,
-    "tool_input": <args-json-object>
-}},
-{{
-    "tool": <function-name>,
-    "tool_input": <args-json-object>
-}}
-You SHOULD NOT include any other text in the response if you call a function
-{tools}
-"""

 import re
+def check_prompt(prompt):
+    """Check for unassigned variables"""
     # A sanity check that we don't have unassigned variables
     matches = re.findall(r"\{.*?\}", " ".join(prompt))
     if matches:
         raise ValueError(f"Unassigned variables in prompt: {' '.join(matches)}")
     return prompt
+def query_prompt():
     """Return system prompt for query step"""
     # Get start and end months from database
         # "Do not use your memory or knowledge to answer the user's question. Only retrieve emails based on the user's question. "  # Qwen
         # "If you decide not to retrieve emails, tell the user why and suggest how to improve their question to chat with the R-help mailing list. "
     )
+    prompt = check_prompt(prompt)
     return prompt
+def answer_prompt():
     """Return system prompt for answer step"""
     prompt = (
         f"Today Date: {date.today()}. "
         "Only answer general questions about R if the answer is in the retrieved emails. "
         "Only include URLs if they were used by human authors (not in email headers), and do not modify any URLs. "  # Qwen, Gemma
         "Respond with 500 words maximum and 50 lines of code maximum. "
+        "Use answer_with_citations to provide the complete answer and all citations used. "
     )
+    prompt = check_prompt(prompt)
     return prompt

requirements.txt CHANGED Viewed

@@ -1,25 +1,17 @@
-# Pin torch and chroma versions
-torch==2.5.1
 chromadb==0.6.3
 # NOTE: chromadb==1.0.13 was giving intermittent error:
 #   ValueError('Could not connect to tenant default_tenant. Are you sure it exists?')
-# FlashAttention
-#flash-attn==2.8.2
-# Stated requirements:
-#   Gemma 3: transformers>=4.50
-#   Qwen3:   transformers>=4.51
-#   SmolLM3: transformers>=4.53
-transformers==4.51.3
-tokenizers==0.21.2
-# Only needed with AutoModelForCausalLM.from_pretrained(device_map="auto")
-#accelerate==1.8.1
-# Required by langchain-huggingface
-sentence-transformers==5.0.0
-# For snapshot_download
-huggingface-hub==0.34.3
 # Langchain packages
 langchain==0.3.26
@@ -27,31 +19,14 @@ langchain-core==0.3.72
 langchain-chroma==0.2.3
 langchain-openai==0.3.27
 langchain-community==0.3.27
-langchain-huggingface==0.3.0
 langchain-text-splitters==0.3.8
 langgraph==0.4.7
 langgraph-sdk==0.1.72
 langgraph-prebuilt==0.5.2
 langgraph-checkpoint==2.1.0
-# Required by Nomic embeddings
-einops==0.8.1
-# Commented because we have local modifications
-#tool-calling-llm==0.1.2
-bm25s==0.2.12
 ragas==0.2.15
-# posthog<6.0.0 is temporary fix for ChromaDB telemetry error log messages
-# https://github.com/vanna-ai/vanna/issues/917
-posthog==5.4.0
-# Gradio for the web interface
 gradio==5.38.2
-spaces==0.37.1
-# For downloading data from S3
-boto3==1.39.14
-# Others
-python-dotenv==1.1.1

+# To load API keys
+python-dotenv==1.1.1
+# To download data from S3
+boto3==1.39.14
+# Retrieval
+bm25s==0.2.12
 chromadb==0.6.3
 # NOTE: chromadb==1.0.13 was giving intermittent error:
 #   ValueError('Could not connect to tenant default_tenant. Are you sure it exists?')
+# posthog<6.0.0 is temporary fix for ChromaDB telemetry error log messages
+# https://github.com/vanna-ai/vanna/issues/917
+posthog==5.4.0
 # Langchain packages
 langchain==0.3.26
 langchain-chroma==0.2.3
 langchain-openai==0.3.27
 langchain-community==0.3.27
 langchain-text-splitters==0.3.8
 langgraph==0.4.7
 langgraph-sdk==0.1.72
 langgraph-prebuilt==0.5.2
 langgraph-checkpoint==2.1.0
+# Evaluations
 ragas==0.2.15
+# Frontend
 gradio==5.38.2

retriever.py CHANGED Viewed

@@ -1,25 +1,17 @@
 # Main retriever modules
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import TextLoader
-from langchain_chroma import Chroma
 from langchain.retrievers import ParentDocumentRetriever, EnsembleRetriever
-from langchain_core.documents import Document
 from langchain_core.retrievers import BaseRetriever, RetrieverLike
 from langchain_core.callbacks import CallbackManagerForRetrieverRun
 from typing import Any, Optional
 import chromadb
-import torch
 import os
 import re
-# To use OpenAI models (remote)
-from langchain_openai import OpenAIEmbeddings
-## To use Hugging Face models (local)
-# from langchain_huggingface import HuggingFaceEmbeddings
-# For more control over BGE and Nomic embeddings
-from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 # Local modules
 from mods.bm25s_retriever import BM25SRetriever
 from mods.file_system import LocalFileStore
@@ -27,41 +19,30 @@ from mods.file_system import LocalFileStore
 # Database directory
 db_dir = "db"
-# Embedding model
-embedding_model_id = "nomic-ai/nomic-embed-text-v1.5"
 def BuildRetriever(
-    compute_mode,
     search_type: str = "hybrid",
     top_k=6,
     start_year=None,
     end_year=None,
-    embedding_ckpt_dir=None,
 ):
     """
     Build retriever instance.
     All retriever types are configured to return up to 6 documents for fair comparison in evals.
     Args:
-        compute_mode: Compute mode for embeddings (remote or local)
         search_type: Type of search to use. Options: "dense", "sparse", "hybrid"
         top_k: Number of documents to retrieve for "dense" and "sparse"
         start_year: Start year (optional)
         end_year: End year (optional)
-        embedding_ckpt_dir: Directory for embedding model checkpoint
     """
     if search_type == "dense":
         if not (start_year or end_year):
             # No year filtering, so directly use base retriever
-            return BuildRetrieverDense(
-                compute_mode, top_k=top_k, embedding_ckpt_dir=embedding_ckpt_dir
-            )
         else:
             # Get 1000 documents then keep top_k filtered by year
-            base_retriever = BuildRetrieverDense(
-                compute_mode, top_k=1000, embedding_ckpt_dir=embedding_ckpt_dir
-            )
             return TopKRetriever(
                 base_retriever=base_retriever,
                 top_k=top_k,
@@ -85,20 +66,16 @@ def BuildRetriever(
         # Use floor (top_k // 2) and ceiling -(top_k // -2) to divide odd values of top_k
         # https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python
         dense_retriever = BuildRetriever(
-            compute_mode,
             "dense",
             (top_k // 2),
             start_year,
             end_year,
-            embedding_ckpt_dir,
         )
         sparse_retriever = BuildRetriever(
-            compute_mode,
             "sparse",
             -(top_k // -2),
             start_year,
             end_year,
-            embedding_ckpt_dir,
         )
         ensemble_retriever = EnsembleRetriever(
             retrievers=[dense_retriever, sparse_retriever], weights=[1, 1]
@@ -128,43 +105,19 @@ def BuildRetrieverSparse(top_k=6):
     return retriever
-def BuildRetrieverDense(compute_mode: str, top_k=6, embedding_ckpt_dir=None):
     """
     Build dense retriever instance with ChromaDB vectorstore
     Args:
-        compute_mode: Compute mode for embeddings (remote or local)
         top_k: Number of documents to retrieve
-        embedding_ckpt_dir: Directory for embedding model checkpoint
     """
-    # Don't try to use local models without a GPU
-    if compute_mode == "local" and not torch.cuda.is_available():
-        raise Exception("Local embeddings selected without GPU")
     # Define embedding model
-    if compute_mode == "remote":
-        embedding_function = OpenAIEmbeddings(model="text-embedding-3-small")
-    if compute_mode == "local":
-        # embedding_function = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5", show_progress=True)
-        # https://python.langchain.com/api_reference/community/embeddings/langchain_community.embeddings.huggingface.HuggingFaceBgeEmbeddings.html
-        model_kwargs = {
-            "device": "cuda",
-            "trust_remote_code": True,
-        }
-        encode_kwargs = {"normalize_embeddings": True}
-        # Use embedding model ID or checkpoint directory if given
-        id_or_dir = embedding_ckpt_dir if embedding_ckpt_dir else embedding_model_id
-        embedding_function = HuggingFaceBgeEmbeddings(
-            model_name=id_or_dir,
-            model_kwargs=model_kwargs,
-            encode_kwargs=encode_kwargs,
-            query_instruction="search_query:",
-            embed_instruction="search_document:",
-        )
     # Create vector store
     client_settings = chromadb.config.Settings(anonymized_telemetry=False)
-    persist_directory = f"{db_dir}/chroma_{compute_mode}"
     vectorstore = Chroma(
         collection_name="R-help",
         embedding_function=embedding_function,
@@ -172,7 +125,7 @@ def BuildRetrieverDense(compute_mode: str, top_k=6, embedding_ckpt_dir=None):
         persist_directory=persist_directory,
     )
     # The storage layer for the parent documents
-    file_store = f"{db_dir}/file_store_{compute_mode}"
     byte_store = LocalFileStore(file_store)
     # Text splitter for child documents
     child_splitter = RecursiveCharacterTextSplitter(

 # Main retriever modules
 from langchain.retrievers import ParentDocumentRetriever, EnsembleRetriever
 from langchain_core.retrievers import BaseRetriever, RetrieverLike
 from langchain_core.callbacks import CallbackManagerForRetrieverRun
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import TextLoader
+from langchain_core.documents import Document
+from langchain_openai import OpenAIEmbeddings
+from langchain_chroma import Chroma
 from typing import Any, Optional
 import chromadb
 import os
 import re
 # Local modules
 from mods.bm25s_retriever import BM25SRetriever
 from mods.file_system import LocalFileStore
 # Database directory
 db_dir = "db"
 def BuildRetriever(
     search_type: str = "hybrid",
     top_k=6,
     start_year=None,
     end_year=None,
 ):
     """
     Build retriever instance.
     All retriever types are configured to return up to 6 documents for fair comparison in evals.
     Args:
         search_type: Type of search to use. Options: "dense", "sparse", "hybrid"
         top_k: Number of documents to retrieve for "dense" and "sparse"
         start_year: Start year (optional)
         end_year: End year (optional)
     """
     if search_type == "dense":
         if not (start_year or end_year):
             # No year filtering, so directly use base retriever
+            return BuildRetrieverDense(top_k=top_k)
         else:
             # Get 1000 documents then keep top_k filtered by year
+            base_retriever = BuildRetrieverDense(top_k=1000)
             return TopKRetriever(
                 base_retriever=base_retriever,
                 top_k=top_k,
         # Use floor (top_k // 2) and ceiling -(top_k // -2) to divide odd values of top_k
         # https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python
         dense_retriever = BuildRetriever(
             "dense",
             (top_k // 2),
             start_year,
             end_year,
         )
         sparse_retriever = BuildRetriever(
             "sparse",
             -(top_k // -2),
             start_year,
             end_year,
         )
         ensemble_retriever = EnsembleRetriever(
             retrievers=[dense_retriever, sparse_retriever], weights=[1, 1]
     return retriever
+def BuildRetrieverDense(top_k=6):
     """
     Build dense retriever instance with ChromaDB vectorstore
     Args:
         top_k: Number of documents to retrieve
     """
     # Define embedding model
+    embedding_function = OpenAIEmbeddings(model="text-embedding-3-small")
     # Create vector store
     client_settings = chromadb.config.Settings(anonymized_telemetry=False)
+    persist_directory = f"{db_dir}/chroma"
     vectorstore = Chroma(
         collection_name="R-help",
         embedding_function=embedding_function,
         persist_directory=persist_directory,
     )
     # The storage layer for the parent documents
+    file_store = f"{db_dir}/file_store"
     byte_store = LocalFileStore(file_store)
     # Text splitter for child documents
     child_splitter = RecursiveCharacterTextSplitter(

util.py CHANGED Viewed

@@ -5,21 +5,6 @@ import os
 import re
-def get_collection(compute_mode):
-    """
-    Returns the vectorstore collection.
-    Usage Examples:
-        # Number of child documents
-        collection = get_collection("remote")
-        len(collection["ids"])
-        # Number of parent documents (unique doc_ids)
-        len(set([m["doc_id"] for m in collection["metadatas"]]))
-    """
-    retriever = BuildRetriever(compute_mode, "dense")
-    return retriever.vectorstore.get()
 def get_sources():
     """
     Return the source files indexed in the database, e.g. 'R-help/2024-April.txt'.

 import re
 def get_sources():
     """
     Return the source files indexed in the database, e.g. 'R-help/2024-April.txt'.