Spaces:
Running
Running
UI callbacks and style changes
Browse files- app.py +2 -0
- ask_candid/base/config/data.py +5 -14
- ask_candid/chat.py +6 -1
- ask_candid/graph.py +53 -28
- ask_candid/retrieval/elastic.py +60 -117
- ask_candid/retrieval/sources/candid_blog.py +7 -0
- ask_candid/retrieval/sources/candid_help.py +7 -0
- ask_candid/retrieval/sources/candid_learning.py +7 -0
- ask_candid/retrieval/sources/candid_news.py +7 -0
- ask_candid/retrieval/sources/issuelab.py +7 -0
- ask_candid/retrieval/sources/schema.py +9 -0
- ask_candid/retrieval/sources/youtube.py +8 -0
- ask_candid/tools/elastic/list_indices_tool.py +2 -1
- ask_candid/tools/org_seach.py +25 -12
- ask_candid/tools/search.py +86 -2
- ask_candid/utils.py +1 -1
app.py
CHANGED
|
@@ -147,6 +147,8 @@ def build_rag_chat() -> Tuple[LoggedComponents, gr.Blocks]:
|
|
| 147 |
show_copy_button=True,
|
| 148 |
show_share_button=None,
|
| 149 |
show_copy_all_button=False,
|
|
|
|
|
|
|
| 150 |
)
|
| 151 |
msg = gr.MultimodalTextbox(label="Your message", interactive=True)
|
| 152 |
thread_id = gr.Text(visible=False, value="", label="thread_id")
|
|
|
|
| 147 |
show_copy_button=True,
|
| 148 |
show_share_button=None,
|
| 149 |
show_copy_all_button=False,
|
| 150 |
+
autoscroll=True,
|
| 151 |
+
layout="panel",
|
| 152 |
)
|
| 153 |
msg = gr.MultimodalTextbox(label="Your message", interactive=True)
|
| 154 |
thread_id = gr.Text(visible=False, value="", label="thread_id")
|
ask_candid/base/config/data.py
CHANGED
|
@@ -1,21 +1,12 @@
|
|
| 1 |
-
|
| 2 |
-
"Mapping from plain name to Elasticsearch index name"
|
| 3 |
|
| 4 |
-
|
| 5 |
-
ISSUELAB_INDEX_ELSER = "search-semantic-issuelab-elser_ve2"
|
| 6 |
-
YOUTUBE_INDEX = "search-semantic-youtube_v1"
|
| 7 |
-
YOUTUBE_INDEX_ELSER = "search-semantic-youtube-elser_ve1"
|
| 8 |
-
CANDID_BLOG_INDEX = "search-semantic-candid-blog_v1"
|
| 9 |
-
CANDID_BLOG_INDEX_ELSER = "search-semantic-candid-blog"
|
| 10 |
-
CANDID_LEARNING_INDEX_ELSER = "search-semantic-candid-learning_ve1"
|
| 11 |
-
CANDID_HELP_INDEX_ELSER = "search-semantic-candid-help-elser_ve1"
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
ALL_INDICES = (
|
| 15 |
"issuelab",
|
| 16 |
"youtube",
|
| 17 |
"candid_blog",
|
| 18 |
"candid_learning",
|
| 19 |
"candid_help",
|
| 20 |
"news"
|
| 21 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Literal, get_args
|
|
|
|
| 2 |
|
| 3 |
+
DataIndices = Literal[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"issuelab",
|
| 5 |
"youtube",
|
| 6 |
"candid_blog",
|
| 7 |
"candid_learning",
|
| 8 |
"candid_help",
|
| 9 |
"news"
|
| 10 |
+
]
|
| 11 |
+
|
| 12 |
+
ALL_INDICES = get_args(DataIndices)
|
ask_candid/chat.py
CHANGED
|
@@ -29,7 +29,12 @@ def run_chat(
|
|
| 29 |
config = {"configurable": {"thread_id": thread_id}}
|
| 30 |
|
| 31 |
enable_recommendations = "Recommendation" in premium_features
|
| 32 |
-
workflow = build_compute_graph(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
memory = MemorySaver() # TODO: don't use for Prod
|
| 35 |
graph = workflow.compile(checkpointer=memory)
|
|
|
|
| 29 |
config = {"configurable": {"thread_id": thread_id}}
|
| 30 |
|
| 31 |
enable_recommendations = "Recommendation" in premium_features
|
| 32 |
+
workflow = build_compute_graph(
|
| 33 |
+
llm=llm,
|
| 34 |
+
indices=indices,
|
| 35 |
+
user_callback=gr.Info,
|
| 36 |
+
enable_recommendations=enable_recommendations
|
| 37 |
+
)
|
| 38 |
|
| 39 |
memory = MemorySaver() # TODO: don't use for Prod
|
| 40 |
graph = workflow.compile(checkpointer=memory)
|
ask_candid/graph.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
from functools import partial
|
| 3 |
import logging
|
| 4 |
|
|
@@ -11,7 +11,6 @@ from langgraph.prebuilt import tools_condition, ToolNode
|
|
| 11 |
from langgraph.graph.state import StateGraph
|
| 12 |
from langgraph.constants import START, END
|
| 13 |
|
| 14 |
-
from ask_candid.retrieval.elastic import retriever_tool
|
| 15 |
from ask_candid.tools.recommendation import (
|
| 16 |
detect_intent_with_llm,
|
| 17 |
determine_context,
|
|
@@ -19,8 +18,9 @@ from ask_candid.tools.recommendation import (
|
|
| 19 |
)
|
| 20 |
from ask_candid.tools.question_reformulation import reformulate_question_using_history
|
| 21 |
from ask_candid.tools.org_seach import has_org_name, insert_org_link
|
| 22 |
-
from ask_candid.tools.search import search_agent
|
| 23 |
from ask_candid.agents.schema import AgentState
|
|
|
|
| 24 |
|
| 25 |
from ask_candid.utils import html_format_docs_chat
|
| 26 |
|
|
@@ -29,7 +29,11 @@ logger = logging.getLogger(__name__)
|
|
| 29 |
logger.setLevel(logging.INFO)
|
| 30 |
|
| 31 |
|
| 32 |
-
def generate_with_context(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
"""Generate answer.
|
| 34 |
|
| 35 |
Parameters
|
|
@@ -37,6 +41,8 @@ def generate_with_context(state: AgentState, llm: LLM) -> AgentState:
|
|
| 37 |
state : AgentState
|
| 38 |
The current state
|
| 39 |
llm : LLM
|
|
|
|
|
|
|
| 40 |
|
| 41 |
Returns
|
| 42 |
-------
|
|
@@ -45,14 +51,20 @@ def generate_with_context(state: AgentState, llm: LLM) -> AgentState:
|
|
| 45 |
"""
|
| 46 |
|
| 47 |
logger.info("---GENERATE ANSWER---")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
messages = state["messages"]
|
| 49 |
question = state["user_input"]
|
| 50 |
last_message = messages[-1]
|
| 51 |
|
| 52 |
sources_str = last_message.content
|
| 53 |
-
sources_list = last_message.artifact
|
| 54 |
-
# converting to html string
|
| 55 |
sources_html = html_format_docs_chat(sources_list)
|
|
|
|
| 56 |
if sources_list:
|
| 57 |
logger.info("---ADD SOURCES---")
|
| 58 |
state["messages"].append(BaseMessage(content=sources_html, type="HTML"))
|
|
@@ -97,13 +109,13 @@ def add_recommendations_pipeline_(
|
|
| 97 |
"""
|
| 98 |
|
| 99 |
# Nodes for recommendation functionalities
|
| 100 |
-
G.add_node("detect_intent_with_llm", partial(detect_intent_with_llm, llm=llm))
|
| 101 |
-
G.add_node("determine_context", determine_context)
|
| 102 |
-
G.add_node("make_recommendation", make_recommendation)
|
| 103 |
|
| 104 |
# Check for recommendation query first
|
| 105 |
# Execute until reaching END if user asks for recommendation
|
| 106 |
-
G.add_edge(reformulation_node_name, "detect_intent_with_llm")
|
| 107 |
G.add_conditional_edges(
|
| 108 |
source="detect_intent_with_llm",
|
| 109 |
path=lambda state: "determine_context" if state["intent"] in ["rfp", "funder"] else search_node_name,
|
|
@@ -112,24 +124,27 @@ def add_recommendations_pipeline_(
|
|
| 112 |
search_node_name: search_node_name
|
| 113 |
},
|
| 114 |
)
|
| 115 |
-
G.add_edge("determine_context", "make_recommendation")
|
| 116 |
-
G.add_edge("make_recommendation", END)
|
| 117 |
|
| 118 |
|
| 119 |
def build_compute_graph(
|
| 120 |
llm: LLM,
|
| 121 |
-
indices: List[
|
| 122 |
-
enable_recommendations: bool = False
|
|
|
|
| 123 |
) -> StateGraph:
|
| 124 |
"""Execution graph builder, the output is the execution flow for an interaction with the assistant.
|
| 125 |
|
| 126 |
Parameters
|
| 127 |
----------
|
| 128 |
llm : LLM
|
| 129 |
-
indices : List[
|
| 130 |
Semantic index names to search over
|
| 131 |
enable_recommendations : bool, optional
|
| 132 |
Set to `True` to allow the flow to generate recommendations based on context, by default False
|
|
|
|
|
|
|
| 133 |
|
| 134 |
Returns
|
| 135 |
-------
|
|
@@ -137,25 +152,35 @@ def build_compute_graph(
|
|
| 137 |
Execution graph
|
| 138 |
"""
|
| 139 |
|
| 140 |
-
candid_retriever_tool = retriever_tool(indices=indices)
|
| 141 |
retrieve = ToolNode([candid_retriever_tool])
|
| 142 |
tools = [candid_retriever_tool]
|
| 143 |
|
| 144 |
G = StateGraph(AgentState)
|
| 145 |
|
| 146 |
-
G.add_node(
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
G.add_node("
|
| 151 |
-
G.add_node("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
if enable_recommendations:
|
| 154 |
-
add_recommendations_pipeline_(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
else:
|
| 156 |
-
G.add_edge("reformulate", "search_agent")
|
| 157 |
|
| 158 |
-
G.add_edge(START, "reformulate")
|
| 159 |
G.add_conditional_edges(
|
| 160 |
source="search_agent",
|
| 161 |
path=tools_condition,
|
|
@@ -164,8 +189,8 @@ def build_compute_graph(
|
|
| 164 |
END: "has_org_name",
|
| 165 |
},
|
| 166 |
)
|
| 167 |
-
G.add_edge("retrieve", "generate_with_context")
|
| 168 |
-
G.add_edge("generate_with_context", "has_org_name")
|
| 169 |
G.add_conditional_edges(
|
| 170 |
source="has_org_name",
|
| 171 |
path=lambda x: x["next"], # Now we're accessing the 'next' key from the dict
|
|
@@ -174,5 +199,5 @@ def build_compute_graph(
|
|
| 174 |
END: END
|
| 175 |
},
|
| 176 |
)
|
| 177 |
-
G.add_edge("insert_org_link", END)
|
| 178 |
return G
|
|
|
|
| 1 |
+
from typing import List, Optional, Callable, Any
|
| 2 |
from functools import partial
|
| 3 |
import logging
|
| 4 |
|
|
|
|
| 11 |
from langgraph.graph.state import StateGraph
|
| 12 |
from langgraph.constants import START, END
|
| 13 |
|
|
|
|
| 14 |
from ask_candid.tools.recommendation import (
|
| 15 |
detect_intent_with_llm,
|
| 16 |
determine_context,
|
|
|
|
| 18 |
)
|
| 19 |
from ask_candid.tools.question_reformulation import reformulate_question_using_history
|
| 20 |
from ask_candid.tools.org_seach import has_org_name, insert_org_link
|
| 21 |
+
from ask_candid.tools.search import search_agent, retriever_tool
|
| 22 |
from ask_candid.agents.schema import AgentState
|
| 23 |
+
from ask_candid.base.config.data import DataIndices
|
| 24 |
|
| 25 |
from ask_candid.utils import html_format_docs_chat
|
| 26 |
|
|
|
|
| 29 |
logger.setLevel(logging.INFO)
|
| 30 |
|
| 31 |
|
| 32 |
+
def generate_with_context(
|
| 33 |
+
state: AgentState,
|
| 34 |
+
llm: LLM,
|
| 35 |
+
user_callback: Optional[Callable[[str], Any]] = None
|
| 36 |
+
) -> AgentState:
|
| 37 |
"""Generate answer.
|
| 38 |
|
| 39 |
Parameters
|
|
|
|
| 41 |
state : AgentState
|
| 42 |
The current state
|
| 43 |
llm : LLM
|
| 44 |
+
user_callback : Optional[Callable[[str], Any]], optional
|
| 45 |
+
Optional UI callback to inform the user of apps states, by default None
|
| 46 |
|
| 47 |
Returns
|
| 48 |
-------
|
|
|
|
| 51 |
"""
|
| 52 |
|
| 53 |
logger.info("---GENERATE ANSWER---")
|
| 54 |
+
if user_callback is not None:
|
| 55 |
+
try:
|
| 56 |
+
user_callback("Writing a response...")
|
| 57 |
+
except Exception as ex:
|
| 58 |
+
logger.warning("User callback was passed in but failed: %s", ex)
|
| 59 |
+
|
| 60 |
messages = state["messages"]
|
| 61 |
question = state["user_input"]
|
| 62 |
last_message = messages[-1]
|
| 63 |
|
| 64 |
sources_str = last_message.content
|
| 65 |
+
sources_list = last_message.artifact
|
|
|
|
| 66 |
sources_html = html_format_docs_chat(sources_list)
|
| 67 |
+
|
| 68 |
if sources_list:
|
| 69 |
logger.info("---ADD SOURCES---")
|
| 70 |
state["messages"].append(BaseMessage(content=sources_html, type="HTML"))
|
|
|
|
| 109 |
"""
|
| 110 |
|
| 111 |
# Nodes for recommendation functionalities
|
| 112 |
+
G.add_node(node="detect_intent_with_llm", action=partial(detect_intent_with_llm, llm=llm))
|
| 113 |
+
G.add_node(node="determine_context", action=determine_context)
|
| 114 |
+
G.add_node(node="make_recommendation", action=make_recommendation)
|
| 115 |
|
| 116 |
# Check for recommendation query first
|
| 117 |
# Execute until reaching END if user asks for recommendation
|
| 118 |
+
G.add_edge(start_key=reformulation_node_name, end_key="detect_intent_with_llm")
|
| 119 |
G.add_conditional_edges(
|
| 120 |
source="detect_intent_with_llm",
|
| 121 |
path=lambda state: "determine_context" if state["intent"] in ["rfp", "funder"] else search_node_name,
|
|
|
|
| 124 |
search_node_name: search_node_name
|
| 125 |
},
|
| 126 |
)
|
| 127 |
+
G.add_edge(start_key="determine_context", end_key="make_recommendation")
|
| 128 |
+
G.add_edge(start_key="make_recommendation", end_key=END)
|
| 129 |
|
| 130 |
|
| 131 |
def build_compute_graph(
|
| 132 |
llm: LLM,
|
| 133 |
+
indices: List[DataIndices],
|
| 134 |
+
enable_recommendations: bool = False,
|
| 135 |
+
user_callback: Optional[Callable[[str], Any]] = None
|
| 136 |
) -> StateGraph:
|
| 137 |
"""Execution graph builder, the output is the execution flow for an interaction with the assistant.
|
| 138 |
|
| 139 |
Parameters
|
| 140 |
----------
|
| 141 |
llm : LLM
|
| 142 |
+
indices : List[DataIndices]
|
| 143 |
Semantic index names to search over
|
| 144 |
enable_recommendations : bool, optional
|
| 145 |
Set to `True` to allow the flow to generate recommendations based on context, by default False
|
| 146 |
+
user_callback : Optional[Callable[[str], Any]], optional
|
| 147 |
+
Optional UI callback to inform the user of apps states, by default None
|
| 148 |
|
| 149 |
Returns
|
| 150 |
-------
|
|
|
|
| 152 |
Execution graph
|
| 153 |
"""
|
| 154 |
|
| 155 |
+
candid_retriever_tool = retriever_tool(indices=indices, user_callback=user_callback)
|
| 156 |
retrieve = ToolNode([candid_retriever_tool])
|
| 157 |
tools = [candid_retriever_tool]
|
| 158 |
|
| 159 |
G = StateGraph(AgentState)
|
| 160 |
|
| 161 |
+
G.add_node(
|
| 162 |
+
node="reformulate",
|
| 163 |
+
action=partial(reformulate_question_using_history, llm=llm, focus_on_recommendations=enable_recommendations)
|
| 164 |
+
)
|
| 165 |
+
G.add_node(node="search_agent", action=partial(search_agent, llm=llm, tools=tools))
|
| 166 |
+
G.add_node(node="retrieve", action=retrieve)
|
| 167 |
+
G.add_node(
|
| 168 |
+
node="generate_with_context",
|
| 169 |
+
action=partial(generate_with_context, llm=llm, user_callback=user_callback)
|
| 170 |
+
)
|
| 171 |
+
G.add_node(node="has_org_name", action=partial(has_org_name, llm=llm, user_callback=user_callback))
|
| 172 |
+
G.add_node(node="insert_org_link", action=insert_org_link)
|
| 173 |
|
| 174 |
if enable_recommendations:
|
| 175 |
+
add_recommendations_pipeline_(
|
| 176 |
+
G, llm=llm,
|
| 177 |
+
reformulation_node_name="reformulate",
|
| 178 |
+
search_node_name="search_agent"
|
| 179 |
+
)
|
| 180 |
else:
|
| 181 |
+
G.add_edge(start_key="reformulate", end_key="search_agent")
|
| 182 |
|
| 183 |
+
G.add_edge(start_key=START, end_key="reformulate")
|
| 184 |
G.add_conditional_edges(
|
| 185 |
source="search_agent",
|
| 186 |
path=tools_condition,
|
|
|
|
| 189 |
END: "has_org_name",
|
| 190 |
},
|
| 191 |
)
|
| 192 |
+
G.add_edge(start_key="retrieve", end_key="generate_with_context")
|
| 193 |
+
G.add_edge(start_key="generate_with_context", end_key="has_org_name")
|
| 194 |
G.add_conditional_edges(
|
| 195 |
source="has_org_name",
|
| 196 |
path=lambda x: x["next"], # Now we're accessing the 'next' key from the dict
|
|
|
|
| 199 |
END: END
|
| 200 |
},
|
| 201 |
)
|
| 202 |
+
G.add_edge(start_key="insert_org_link", end_key=END)
|
| 203 |
return G
|
ask_candid/retrieval/elastic.py
CHANGED
|
@@ -1,20 +1,24 @@
|
|
| 1 |
from typing import List, Tuple, Dict, Iterable, Iterator, Optional, Union, Any
|
| 2 |
from dataclasses import dataclass
|
| 3 |
-
from functools import partial
|
| 4 |
from itertools import groupby
|
| 5 |
|
| 6 |
from torch.nn import functional as F
|
| 7 |
|
| 8 |
from pydantic import BaseModel, Field
|
| 9 |
from langchain_core.documents import Document
|
| 10 |
-
from langchain_core.tools import Tool
|
| 11 |
|
| 12 |
from elasticsearch import Elasticsearch
|
| 13 |
|
| 14 |
from ask_candid.retrieval.sparse_lexical import SpladeEncoder
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
from ask_candid.services.small_lm import CandidSLM
|
| 16 |
from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA, NEWS_ELASTIC
|
| 17 |
-
from ask_candid.base.config.data import
|
| 18 |
|
| 19 |
encoder = SpladeEncoder()
|
| 20 |
|
|
@@ -82,6 +86,18 @@ def build_sparse_vector_query(
|
|
| 82 |
|
| 83 |
|
| 84 |
def news_query_builder(query: str) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
tokens = encoder.token_expand(query)
|
| 86 |
|
| 87 |
query = {
|
|
@@ -103,81 +119,70 @@ def news_query_builder(query: str) -> Dict[str, Any]:
|
|
| 103 |
query["query"]["bool"]["should"].append({
|
| 104 |
"multi_match": {
|
| 105 |
"query": token,
|
| 106 |
-
"fields":
|
| 107 |
"boost": score
|
| 108 |
}
|
| 109 |
})
|
| 110 |
return query
|
| 111 |
|
| 112 |
|
| 113 |
-
def query_builder(query: str, indices: List[
|
| 114 |
"""Builds Elasticsearch multi-search query payload
|
| 115 |
|
| 116 |
Parameters
|
| 117 |
----------
|
| 118 |
query : str
|
| 119 |
Search context string
|
| 120 |
-
indices : List[
|
| 121 |
Semantic index names to search over
|
| 122 |
|
| 123 |
Returns
|
| 124 |
-------
|
| 125 |
-
List[Dict[str, Any]]
|
|
|
|
| 126 |
"""
|
| 127 |
|
| 128 |
-
queries = []
|
| 129 |
if indices is None:
|
| 130 |
indices = list(ALL_INDICES)
|
| 131 |
|
| 132 |
for index in indices:
|
| 133 |
if index == "issuelab":
|
| 134 |
-
q = build_sparse_vector_query(
|
| 135 |
-
query=query,
|
| 136 |
-
fields=("description", "content", "combined_issuelab_findings", "combined_item_description")
|
| 137 |
-
)
|
| 138 |
q["_source"] = {"excludes": ["embeddings"]}
|
| 139 |
q["size"] = 1
|
| 140 |
-
queries.extend([{"index":
|
| 141 |
elif index == "youtube":
|
| 142 |
-
q = build_sparse_vector_query(
|
| 143 |
-
|
| 144 |
-
fields=("captions_cleaned", "description_cleaned", "title")
|
| 145 |
-
)
|
| 146 |
-
# text_cleaned duplicates captions_cleaned
|
| 147 |
-
q["_source"] = {"excludes": ["embeddings", "captions", "description", "text_cleaned"]}
|
| 148 |
q["size"] = 2
|
| 149 |
-
queries.extend([{"index":
|
| 150 |
elif index == "candid_blog":
|
| 151 |
-
q = build_sparse_vector_query(
|
| 152 |
-
query=query,
|
| 153 |
-
fields=("content", "authors_text", "title_summary_tags")
|
| 154 |
-
)
|
| 155 |
q["_source"] = {"excludes": ["embeddings"]}
|
| 156 |
q["size"] = 2
|
| 157 |
-
queries.extend([{"index":
|
| 158 |
elif index == "candid_learning":
|
| 159 |
-
q = build_sparse_vector_query(
|
| 160 |
-
query=query,
|
| 161 |
-
fields=("content", "title", "training_topics", "staff_recommendations")
|
| 162 |
-
)
|
| 163 |
q["_source"] = {"excludes": ["embeddings"]}
|
| 164 |
q["size"] = 2
|
| 165 |
-
queries.extend([{"index":
|
| 166 |
elif index == "candid_help":
|
| 167 |
-
q = build_sparse_vector_query(
|
| 168 |
-
query=query,
|
| 169 |
-
fields=("content", "combined_article_description")
|
| 170 |
-
)
|
| 171 |
q["_source"] = {"excludes": ["embeddings"]}
|
| 172 |
q["size"] = 2
|
| 173 |
-
queries.extend([{"index":
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
-
return queries
|
| 176 |
|
| 177 |
|
| 178 |
def multi_search(
|
| 179 |
queries: List[Dict[str, Any]],
|
| 180 |
-
|
| 181 |
) -> List[ElasticHitsResult]:
|
| 182 |
"""Runs multi-search query
|
| 183 |
|
|
@@ -191,6 +196,17 @@ def multi_search(
|
|
| 191 |
List[ElasticHitsResult]
|
| 192 |
"""
|
| 193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
results = []
|
| 195 |
|
| 196 |
if len(queries) > 0:
|
|
@@ -200,31 +216,16 @@ def multi_search(
|
|
| 200 |
verify_certs=False,
|
| 201 |
request_timeout=60 * 3
|
| 202 |
) as es:
|
| 203 |
-
for
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
id=hit["_id"],
|
| 208 |
-
score=hit["_score"],
|
| 209 |
-
source=hit["_source"],
|
| 210 |
-
inner_hits=hit.get("inner_hits", {})
|
| 211 |
-
)
|
| 212 |
-
results.append(hit)
|
| 213 |
-
|
| 214 |
-
if news_query is not None:
|
| 215 |
with Elasticsearch(
|
| 216 |
NEWS_ELASTIC.url,
|
| 217 |
http_auth=(NEWS_ELASTIC.username, NEWS_ELASTIC.password),
|
| 218 |
timeout=60
|
| 219 |
) as es:
|
| 220 |
-
for hit in es.
|
| 221 |
-
hit = ElasticHitsResult(
|
| 222 |
-
index=hit["_index"],
|
| 223 |
-
id=hit["_id"],
|
| 224 |
-
score=hit["_score"],
|
| 225 |
-
source=hit["_source"],
|
| 226 |
-
inner_hits=hit.get("inner_hits", {})
|
| 227 |
-
)
|
| 228 |
results.append(hit)
|
| 229 |
return results
|
| 230 |
|
|
@@ -244,9 +245,8 @@ def get_query_results(search_text: str, indices: Optional[List[str]] = None) ->
|
|
| 244 |
List[ElasticHitsResult]
|
| 245 |
"""
|
| 246 |
|
| 247 |
-
queries = query_builder(query=search_text, indices=indices)
|
| 248 |
-
|
| 249 |
-
return multi_search(queries, news_query=news_q)
|
| 250 |
|
| 251 |
|
| 252 |
def retrieved_text(hits: Dict[str, Any]) -> str:
|
|
@@ -335,36 +335,6 @@ def reranker(
|
|
| 335 |
yield from sorted(results, key=lambda x: x.score, reverse=True)
|
| 336 |
|
| 337 |
|
| 338 |
-
def get_results(user_input: str, indices: List[str]) -> Tuple[str, List[Document]]:
|
| 339 |
-
"""End-to-end search and re-rank function.
|
| 340 |
-
|
| 341 |
-
Parameters
|
| 342 |
-
----------
|
| 343 |
-
user_input : str
|
| 344 |
-
Search context string
|
| 345 |
-
indices : List[str]
|
| 346 |
-
Semantic index names to search over
|
| 347 |
-
|
| 348 |
-
Returns
|
| 349 |
-
-------
|
| 350 |
-
Tuple[str, List[Document]]
|
| 351 |
-
(concatenated text from search results, documents list)
|
| 352 |
-
"""
|
| 353 |
-
|
| 354 |
-
output = ["Search didn't return any Candid sources"]
|
| 355 |
-
page_content = []
|
| 356 |
-
content = "Search didn't return any Candid sources"
|
| 357 |
-
results = get_query_results(search_text=user_input, indices=indices)
|
| 358 |
-
if results:
|
| 359 |
-
output = get_reranked_results(results, search_text=user_input)
|
| 360 |
-
for doc in output:
|
| 361 |
-
page_content.append(doc.page_content)
|
| 362 |
-
content = "\n\n".join(page_content)
|
| 363 |
-
|
| 364 |
-
# for the tool we need to return a tuple for content_and_artifact type
|
| 365 |
-
return content, output
|
| 366 |
-
|
| 367 |
-
|
| 368 |
def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
|
| 369 |
"""Pads the relevant chunk of text with context before and after
|
| 370 |
|
|
@@ -537,30 +507,3 @@ def get_reranked_results(results: List[ElasticHitsResult], search_text: Optional
|
|
| 537 |
if hit is not None:
|
| 538 |
output.append(hit)
|
| 539 |
return output
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
def retriever_tool(indices: List[str]) -> Tool:
|
| 543 |
-
"""Tool component for use in conditional edge building for RAG execution graph.
|
| 544 |
-
Cannot use `create_retriever_tool` because it only provides content losing all metadata on the way
|
| 545 |
-
https://python.langchain.com/docs/how_to/custom_tools/#returning-artifacts-of-tool-execution
|
| 546 |
-
|
| 547 |
-
Parameters
|
| 548 |
-
----------
|
| 549 |
-
indices : List[str]
|
| 550 |
-
Semantic index names to search over
|
| 551 |
-
|
| 552 |
-
Returns
|
| 553 |
-
-------
|
| 554 |
-
Tool
|
| 555 |
-
"""
|
| 556 |
-
|
| 557 |
-
return Tool(
|
| 558 |
-
name="retrieve_social_sector_information",
|
| 559 |
-
func=partial(get_results, indices=indices),
|
| 560 |
-
description=(
|
| 561 |
-
"Return additional information about social and philanthropic sector, "
|
| 562 |
-
"including nonprofits (NGO), grants, foundations, funding, RFP, LOI, Candid."
|
| 563 |
-
),
|
| 564 |
-
args_schema=RetrieverInput,
|
| 565 |
-
response_format="content_and_artifact"
|
| 566 |
-
)
|
|
|
|
| 1 |
from typing import List, Tuple, Dict, Iterable, Iterator, Optional, Union, Any
|
| 2 |
from dataclasses import dataclass
|
|
|
|
| 3 |
from itertools import groupby
|
| 4 |
|
| 5 |
from torch.nn import functional as F
|
| 6 |
|
| 7 |
from pydantic import BaseModel, Field
|
| 8 |
from langchain_core.documents import Document
|
|
|
|
| 9 |
|
| 10 |
from elasticsearch import Elasticsearch
|
| 11 |
|
| 12 |
from ask_candid.retrieval.sparse_lexical import SpladeEncoder
|
| 13 |
+
from ask_candid.retrieval.sources.issuelab import IssueLabConfig
|
| 14 |
+
from ask_candid.retrieval.sources.youtube import YoutubeConfig
|
| 15 |
+
from ask_candid.retrieval.sources.candid_blog import CandidBlogConfig
|
| 16 |
+
from ask_candid.retrieval.sources.candid_learning import CandidLearningConfig
|
| 17 |
+
from ask_candid.retrieval.sources.candid_help import CandidHelpConfig
|
| 18 |
+
from ask_candid.retrieval.sources.candid_news import CandidNewsConfig
|
| 19 |
from ask_candid.services.small_lm import CandidSLM
|
| 20 |
from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA, NEWS_ELASTIC
|
| 21 |
+
from ask_candid.base.config.data import DataIndices, ALL_INDICES
|
| 22 |
|
| 23 |
encoder = SpladeEncoder()
|
| 24 |
|
|
|
|
| 86 |
|
| 87 |
|
| 88 |
def news_query_builder(query: str) -> Dict[str, Any]:
|
| 89 |
+
"""Builds a valid Elasticsearch query against Candid news, simulating a token expansion.
|
| 90 |
+
|
| 91 |
+
Parameters
|
| 92 |
+
----------
|
| 93 |
+
query : str
|
| 94 |
+
Search context string
|
| 95 |
+
|
| 96 |
+
Returns
|
| 97 |
+
-------
|
| 98 |
+
Dict[str, Any]
|
| 99 |
+
"""
|
| 100 |
+
|
| 101 |
tokens = encoder.token_expand(query)
|
| 102 |
|
| 103 |
query = {
|
|
|
|
| 119 |
query["query"]["bool"]["should"].append({
|
| 120 |
"multi_match": {
|
| 121 |
"query": token,
|
| 122 |
+
"fields": CandidNewsConfig.text_fields,
|
| 123 |
"boost": score
|
| 124 |
}
|
| 125 |
})
|
| 126 |
return query
|
| 127 |
|
| 128 |
|
| 129 |
+
def query_builder(query: str, indices: List[DataIndices]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
|
| 130 |
"""Builds Elasticsearch multi-search query payload
|
| 131 |
|
| 132 |
Parameters
|
| 133 |
----------
|
| 134 |
query : str
|
| 135 |
Search context string
|
| 136 |
+
indices : List[DataIndices]
|
| 137 |
Semantic index names to search over
|
| 138 |
|
| 139 |
Returns
|
| 140 |
-------
|
| 141 |
+
Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]
|
| 142 |
+
(semantic index queries, news queries)
|
| 143 |
"""
|
| 144 |
|
| 145 |
+
queries, news_queries = [], []
|
| 146 |
if indices is None:
|
| 147 |
indices = list(ALL_INDICES)
|
| 148 |
|
| 149 |
for index in indices:
|
| 150 |
if index == "issuelab":
|
| 151 |
+
q = build_sparse_vector_query(query=query, fields=IssueLabConfig.text_fields)
|
|
|
|
|
|
|
|
|
|
| 152 |
q["_source"] = {"excludes": ["embeddings"]}
|
| 153 |
q["size"] = 1
|
| 154 |
+
queries.extend([{"index": IssueLabConfig.index_name}, q])
|
| 155 |
elif index == "youtube":
|
| 156 |
+
q = build_sparse_vector_query(query=query, fields=YoutubeConfig.text_fields)
|
| 157 |
+
q["_source"] = {"excludes": ["embeddings", *YoutubeConfig.excluded_fields]}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
q["size"] = 2
|
| 159 |
+
queries.extend([{"index": YoutubeConfig.index_name}, q])
|
| 160 |
elif index == "candid_blog":
|
| 161 |
+
q = build_sparse_vector_query(query=query, fields=CandidBlogConfig.text_fields)
|
|
|
|
|
|
|
|
|
|
| 162 |
q["_source"] = {"excludes": ["embeddings"]}
|
| 163 |
q["size"] = 2
|
| 164 |
+
queries.extend([{"index": CandidBlogConfig.index_name}, q])
|
| 165 |
elif index == "candid_learning":
|
| 166 |
+
q = build_sparse_vector_query(query=query, fields=CandidLearningConfig.text_fields)
|
|
|
|
|
|
|
|
|
|
| 167 |
q["_source"] = {"excludes": ["embeddings"]}
|
| 168 |
q["size"] = 2
|
| 169 |
+
queries.extend([{"index": CandidLearningConfig.index_name}, q])
|
| 170 |
elif index == "candid_help":
|
| 171 |
+
q = build_sparse_vector_query(query=query, fields=CandidHelpConfig.text_fields)
|
|
|
|
|
|
|
|
|
|
| 172 |
q["_source"] = {"excludes": ["embeddings"]}
|
| 173 |
q["size"] = 2
|
| 174 |
+
queries.extend([{"index": CandidHelpConfig.index_name}, q])
|
| 175 |
+
elif index == "news":
|
| 176 |
+
q = news_query_builder(query=query)
|
| 177 |
+
q["size"] = 5
|
| 178 |
+
news_queries.extend([{"index": CandidNewsConfig.index_name}, q])
|
| 179 |
|
| 180 |
+
return queries, news_queries
|
| 181 |
|
| 182 |
|
| 183 |
def multi_search(
|
| 184 |
queries: List[Dict[str, Any]],
|
| 185 |
+
news_queries: Optional[List[Dict[str, Any]]] = None
|
| 186 |
) -> List[ElasticHitsResult]:
|
| 187 |
"""Runs multi-search query
|
| 188 |
|
|
|
|
| 196 |
List[ElasticHitsResult]
|
| 197 |
"""
|
| 198 |
|
| 199 |
+
def _msearch_response_generator(responses: List[Dict[str, Any]]) -> Iterator[ElasticHitsResult]:
|
| 200 |
+
for query_group in responses:
|
| 201 |
+
for h in query_group.get("hits", {}).get("hits", []):
|
| 202 |
+
yield ElasticHitsResult(
|
| 203 |
+
index=h["_index"],
|
| 204 |
+
id=h["_id"],
|
| 205 |
+
score=h["_score"],
|
| 206 |
+
source=h["_source"],
|
| 207 |
+
inner_hits=h.get("inner_hits", {})
|
| 208 |
+
)
|
| 209 |
+
|
| 210 |
results = []
|
| 211 |
|
| 212 |
if len(queries) > 0:
|
|
|
|
| 216 |
verify_certs=False,
|
| 217 |
request_timeout=60 * 3
|
| 218 |
) as es:
|
| 219 |
+
for hit in _msearch_response_generator(es.msearch(body=queries).get("responses", [])):
|
| 220 |
+
results.append(hit)
|
| 221 |
+
|
| 222 |
+
if news_queries is not None and len(news_queries):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
with Elasticsearch(
|
| 224 |
NEWS_ELASTIC.url,
|
| 225 |
http_auth=(NEWS_ELASTIC.username, NEWS_ELASTIC.password),
|
| 226 |
timeout=60
|
| 227 |
) as es:
|
| 228 |
+
for hit in _msearch_response_generator(es.msearch(body=news_queries).get("responses", [])):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
results.append(hit)
|
| 230 |
return results
|
| 231 |
|
|
|
|
| 245 |
List[ElasticHitsResult]
|
| 246 |
"""
|
| 247 |
|
| 248 |
+
queries, news_q = query_builder(query=search_text, indices=indices)
|
| 249 |
+
return multi_search(queries, news_queries=news_q)
|
|
|
|
| 250 |
|
| 251 |
|
| 252 |
def retrieved_text(hits: Dict[str, Any]) -> str:
|
|
|
|
| 335 |
yield from sorted(results, key=lambda x: x.score, reverse=True)
|
| 336 |
|
| 337 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
|
| 339 |
"""Pads the relevant chunk of text with context before and after
|
| 340 |
|
|
|
|
| 507 |
if hit is not None:
|
| 508 |
output.append(hit)
|
| 509 |
return output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ask_candid/retrieval/sources/candid_blog.py
CHANGED
|
@@ -1,4 +1,11 @@
|
|
| 1 |
from typing import Dict, Any
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
|
|
|
|
| 1 |
from typing import Dict, Any
|
| 2 |
+
from ask_candid.retrieval.sources.schema import ElasticSourceConfig
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
CandidBlogConfig = ElasticSourceConfig(
|
| 6 |
+
index_name="search-semantic-candid-blog",
|
| 7 |
+
text_fields=("content", "authors_text", "title_summary_tags")
|
| 8 |
+
)
|
| 9 |
|
| 10 |
|
| 11 |
def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
|
ask_candid/retrieval/sources/candid_help.py
CHANGED
|
@@ -1,4 +1,11 @@
|
|
| 1 |
from typing import Dict, Any
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
|
|
|
|
| 1 |
from typing import Dict, Any
|
| 2 |
+
from ask_candid.retrieval.sources.schema import ElasticSourceConfig
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
CandidHelpConfig = ElasticSourceConfig(
|
| 6 |
+
index_name="search-semantic-candid-help-elser_ve1",
|
| 7 |
+
text_fields=("content", "combined_article_description")
|
| 8 |
+
)
|
| 9 |
|
| 10 |
|
| 11 |
def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
|
ask_candid/retrieval/sources/candid_learning.py
CHANGED
|
@@ -1,4 +1,11 @@
|
|
| 1 |
from typing import Dict, Any
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
|
|
|
|
| 1 |
from typing import Dict, Any
|
| 2 |
+
from ask_candid.retrieval.sources.schema import ElasticSourceConfig
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
CandidLearningConfig = ElasticSourceConfig(
|
| 6 |
+
index_name="search-semantic-candid-learning_ve1",
|
| 7 |
+
text_fields=("content", "title", "training_topics", "staff_recommendations")
|
| 8 |
+
)
|
| 9 |
|
| 10 |
|
| 11 |
def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
|
ask_candid/retrieval/sources/candid_news.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ask_candid.retrieval.sources.schema import ElasticSourceConfig
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
CandidNewsConfig = ElasticSourceConfig(
|
| 5 |
+
index_name="news_1",
|
| 6 |
+
text_fields=("title", "content")
|
| 7 |
+
)
|
ask_candid/retrieval/sources/issuelab.py
CHANGED
|
@@ -1,4 +1,11 @@
|
|
| 1 |
from typing import Dict, Any
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
def issuelab_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
|
|
|
|
| 1 |
from typing import Dict, Any
|
| 2 |
+
from ask_candid.retrieval.sources.schema import ElasticSourceConfig
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
IssueLabConfig = ElasticSourceConfig(
|
| 6 |
+
index_name="search-semantic-issuelab-elser_ve2",
|
| 7 |
+
text_fields=("description", "content", "combined_issuelab_findings", "combined_item_description")
|
| 8 |
+
)
|
| 9 |
|
| 10 |
|
| 11 |
def issuelab_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
|
ask_candid/retrieval/sources/schema.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Tuple, Optional
|
| 2 |
+
from dataclasses import dataclass, field
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@dataclass
|
| 6 |
+
class ElasticSourceConfig:
|
| 7 |
+
index_name: str
|
| 8 |
+
text_fields: Tuple[str, ...]
|
| 9 |
+
excluded_fields: Optional[Tuple[str, ...]] = field(default_factory=tuple)
|
ask_candid/retrieval/sources/youtube.py
CHANGED
|
@@ -1,4 +1,12 @@
|
|
| 1 |
from typing import Dict, Any
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
|
|
|
|
| 1 |
from typing import Dict, Any
|
| 2 |
+
from ask_candid.retrieval.sources.schema import ElasticSourceConfig
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
YoutubeConfig = ElasticSourceConfig(
|
| 6 |
+
index_name="search-semantic-youtube-elser_ve1",
|
| 7 |
+
text_fields=("captions_cleaned", "description_cleaned", "title"),
|
| 8 |
+
excluded_fields=("captions", "description", "text_cleaned")
|
| 9 |
+
)
|
| 10 |
|
| 11 |
|
| 12 |
def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
|
ask_candid/tools/elastic/list_indices_tool.py
CHANGED
|
@@ -31,7 +31,8 @@ class ListIndicesTool(BaseTool):
|
|
| 31 |
|
| 32 |
name: str = "elastic_list_indices" # Added type annotation
|
| 33 |
description: str = (
|
| 34 |
-
"Input is a delimiter like comma or new line. Output is a separated list of indices in the database.
|
|
|
|
| 35 |
)
|
| 36 |
args_schema: Optional[Type[BaseModel]] = (
|
| 37 |
ListIndicesInput # Define this before methods
|
|
|
|
| 31 |
|
| 32 |
name: str = "elastic_list_indices" # Added type annotation
|
| 33 |
description: str = (
|
| 34 |
+
"Input is a delimiter like comma or new line. Output is a separated list of indices in the database. "
|
| 35 |
+
"Always use this tool to get to know the indices in the ElasticSearch cluster."
|
| 36 |
)
|
| 37 |
args_schema: Optional[Type[BaseModel]] = (
|
| 38 |
ListIndicesInput # Define this before methods
|
ask_candid/tools/org_seach.py
CHANGED
|
@@ -1,11 +1,10 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
import logging
|
| 3 |
import re
|
| 4 |
|
| 5 |
from thefuzz import fuzz
|
| 6 |
|
| 7 |
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
|
| 8 |
-
# from langchain_openai.chat_models import ChatOpenAI
|
| 9 |
from langchain_core.runnables import RunnableSequence
|
| 10 |
from langchain_core.prompts import ChatPromptTemplate
|
| 11 |
from langchain_core.language_models.llms import LLM
|
|
@@ -15,7 +14,6 @@ from pydantic import BaseModel, Field
|
|
| 15 |
|
| 16 |
from ask_candid.agents.schema import AgentState
|
| 17 |
from ask_candid.services.org_search import OrgSearch
|
| 18 |
-
# from ask_candid.base.config.rest import OPENAI
|
| 19 |
|
| 20 |
search = OrgSearch()
|
| 21 |
logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
|
|
@@ -59,7 +57,6 @@ def extract_org_links_from_chatbot(chatbot_output: str, llm: LLM):
|
|
| 59 |
|
| 60 |
try:
|
| 61 |
parser = JsonOutputToolsParser()
|
| 62 |
-
# llm = ChatOpenAI(model="gpt-4o", api_key=OPENAI["key"]).bind_tools([OrganizationNames])
|
| 63 |
model = llm.bind_tools([OrganizationNames])
|
| 64 |
prompt = ChatPromptTemplate.from_template(prompt)
|
| 65 |
chain = RunnableSequence(prompt, model, parser)
|
|
@@ -203,17 +200,33 @@ def embed_org_links_in_text(input_text: str, org_link_dict: dict):
|
|
| 203 |
return input_text
|
| 204 |
|
| 205 |
|
| 206 |
-
def has_org_name(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
"""
|
| 208 |
-
Processes the latest message to extract organization links and determine the next step.
|
| 209 |
|
| 210 |
-
Args:
|
| 211 |
-
state (AgentState): The current state of the agent, including a list of messages.
|
| 212 |
-
|
| 213 |
-
Returns:
|
| 214 |
-
dict: A dictionary with the next agent action and, if available, a dictionary of organization links.
|
| 215 |
-
"""
|
| 216 |
logger.info("---HAS ORG NAMES?---")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
messages = state["messages"]
|
| 218 |
last_message = messages[-1].content
|
| 219 |
output_list = extract_org_links_from_chatbot(last_message, llm=llm)
|
|
|
|
| 1 |
+
from typing import List, Optional, Callable, Any
|
| 2 |
import logging
|
| 3 |
import re
|
| 4 |
|
| 5 |
from thefuzz import fuzz
|
| 6 |
|
| 7 |
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
|
|
|
|
| 8 |
from langchain_core.runnables import RunnableSequence
|
| 9 |
from langchain_core.prompts import ChatPromptTemplate
|
| 10 |
from langchain_core.language_models.llms import LLM
|
|
|
|
| 14 |
|
| 15 |
from ask_candid.agents.schema import AgentState
|
| 16 |
from ask_candid.services.org_search import OrgSearch
|
|
|
|
| 17 |
|
| 18 |
search = OrgSearch()
|
| 19 |
logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
|
|
|
|
| 57 |
|
| 58 |
try:
|
| 59 |
parser = JsonOutputToolsParser()
|
|
|
|
| 60 |
model = llm.bind_tools([OrganizationNames])
|
| 61 |
prompt = ChatPromptTemplate.from_template(prompt)
|
| 62 |
chain = RunnableSequence(prompt, model, parser)
|
|
|
|
| 200 |
return input_text
|
| 201 |
|
| 202 |
|
| 203 |
+
def has_org_name(
|
| 204 |
+
state: AgentState,
|
| 205 |
+
llm: LLM,
|
| 206 |
+
user_callback: Optional[Callable[[str], Any]] = None
|
| 207 |
+
) -> AgentState:
|
| 208 |
+
"""Processes the latest message to extract organization links and determine the next step.
|
| 209 |
+
|
| 210 |
+
Parameters
|
| 211 |
+
----------
|
| 212 |
+
state : AgentState
|
| 213 |
+
The current state of the agent, including a list of messages.
|
| 214 |
+
llm : LLM
|
| 215 |
+
user_callback : Optional[Callable[[str], Any]], optional
|
| 216 |
+
Optional UI callback to inform the user of apps states, by default None
|
| 217 |
+
|
| 218 |
+
Returns
|
| 219 |
+
-------
|
| 220 |
+
AgentState
|
| 221 |
"""
|
|
|
|
| 222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
logger.info("---HAS ORG NAMES?---")
|
| 224 |
+
if user_callback is not None:
|
| 225 |
+
try:
|
| 226 |
+
user_callback("Checking for relevant organizations")
|
| 227 |
+
except Exception as ex:
|
| 228 |
+
logger.warning("User callback was passed in but failed: %s", ex)
|
| 229 |
+
|
| 230 |
messages = state["messages"]
|
| 231 |
last_message = messages[-1].content
|
| 232 |
output_list = extract_org_links_from_chatbot(last_message, llm=llm)
|
ask_candid/tools/search.py
CHANGED
|
@@ -1,9 +1,14 @@
|
|
| 1 |
-
from typing import List
|
|
|
|
| 2 |
import logging
|
| 3 |
|
|
|
|
| 4 |
from langchain_core.language_models.llms import LLM
|
|
|
|
| 5 |
from langchain_core.tools import Tool
|
| 6 |
|
|
|
|
|
|
|
| 7 |
from ask_candid.agents.schema import AgentState
|
| 8 |
|
| 9 |
logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
|
|
@@ -11,7 +16,86 @@ logger = logging.getLogger(__name__)
|
|
| 11 |
logger.setLevel(logging.INFO)
|
| 12 |
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
"""Invokes the agent model to generate a response based on the current state. Given
|
| 16 |
the question, it will decide to retrieve using the retriever tool, or simply end.
|
| 17 |
|
|
|
|
| 1 |
+
from typing import List, Tuple, Callable, Optional, Any
|
| 2 |
+
from functools import partial
|
| 3 |
import logging
|
| 4 |
|
| 5 |
+
from pydantic import BaseModel, Field
|
| 6 |
from langchain_core.language_models.llms import LLM
|
| 7 |
+
from langchain_core.documents import Document
|
| 8 |
from langchain_core.tools import Tool
|
| 9 |
|
| 10 |
+
from ask_candid.retrieval.elastic import get_query_results, get_reranked_results
|
| 11 |
+
from ask_candid.base.config.data import DataIndices
|
| 12 |
from ask_candid.agents.schema import AgentState
|
| 13 |
|
| 14 |
logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
|
|
|
|
| 16 |
logger.setLevel(logging.INFO)
|
| 17 |
|
| 18 |
|
| 19 |
+
class RetrieverInput(BaseModel):
|
| 20 |
+
"""Input to the Elasticsearch retriever."""
|
| 21 |
+
user_input: str = Field(description="query to look up in retriever")
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def get_search_results(
|
| 25 |
+
user_input: str,
|
| 26 |
+
indices: List[DataIndices],
|
| 27 |
+
user_callback: Optional[Callable[[str], Any]] = None
|
| 28 |
+
) -> Tuple[str, List[Document]]:
|
| 29 |
+
"""End-to-end search and re-rank function.
|
| 30 |
+
|
| 31 |
+
Parameters
|
| 32 |
+
----------
|
| 33 |
+
user_input : str
|
| 34 |
+
Search context string
|
| 35 |
+
indices : List[DataIndices]
|
| 36 |
+
Semantic index names to search over
|
| 37 |
+
user_callback : Optional[Callable[[str], Any]], optional
|
| 38 |
+
Optional UI callback to inform the user of app states, by default None
|
| 39 |
+
|
| 40 |
+
Returns
|
| 41 |
+
-------
|
| 42 |
+
Tuple[str, List[Document]]
|
| 43 |
+
(concatenated text from search results, documents list)
|
| 44 |
+
"""
|
| 45 |
+
|
| 46 |
+
if user_callback is not None:
|
| 47 |
+
try:
|
| 48 |
+
user_callback("Searching for relevant information")
|
| 49 |
+
except Exception as ex:
|
| 50 |
+
logger.warning("User callback was passed in but failed: %s", ex)
|
| 51 |
+
|
| 52 |
+
output = ["Search didn't return any Candid sources"]
|
| 53 |
+
page_content = []
|
| 54 |
+
content = "Search didn't return any Candid sources"
|
| 55 |
+
results = get_query_results(search_text=user_input, indices=indices)
|
| 56 |
+
if results:
|
| 57 |
+
output = get_reranked_results(results, search_text=user_input)
|
| 58 |
+
for doc in output:
|
| 59 |
+
page_content.append(doc.page_content)
|
| 60 |
+
content = "\n\n".join(page_content)
|
| 61 |
+
|
| 62 |
+
# for the tool we need to return a tuple for content_and_artifact type
|
| 63 |
+
return content, output
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def retriever_tool(
|
| 67 |
+
indices: List[DataIndices],
|
| 68 |
+
user_callback: Optional[Callable[[str], Any]] = None
|
| 69 |
+
) -> Tool:
|
| 70 |
+
"""Tool component for use in conditional edge building for RAG execution graph.
|
| 71 |
+
Cannot use `create_retriever_tool` because it only provides content losing all metadata on the way
|
| 72 |
+
https://python.langchain.com/docs/how_to/custom_tools/#returning-artifacts-of-tool-execution
|
| 73 |
+
|
| 74 |
+
Parameters
|
| 75 |
+
----------
|
| 76 |
+
indices : List[DataIndices]
|
| 77 |
+
Semantic index names to search over
|
| 78 |
+
user_callback : Optional[Callable[[str], Any]], optional
|
| 79 |
+
Optional UI callback to inform the user of app states, by default None
|
| 80 |
+
|
| 81 |
+
Returns
|
| 82 |
+
-------
|
| 83 |
+
Tool
|
| 84 |
+
"""
|
| 85 |
+
|
| 86 |
+
return Tool(
|
| 87 |
+
name="retrieve_social_sector_information",
|
| 88 |
+
func=partial(get_search_results, indices=indices, user_callback=user_callback),
|
| 89 |
+
description=(
|
| 90 |
+
"Return additional information about social and philanthropic sector, "
|
| 91 |
+
"including nonprofits (NGO), grants, foundations, funding, RFP, LOI, Candid."
|
| 92 |
+
),
|
| 93 |
+
args_schema=RetrieverInput,
|
| 94 |
+
response_format="content_and_artifact"
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def search_agent(state: AgentState, llm: LLM, tools: List[Tool]) -> AgentState:
|
| 99 |
"""Invokes the agent model to generate a response based on the current state. Given
|
| 100 |
the question, it will decide to retrieve using the retriever tool, or simply end.
|
| 101 |
|
ask_candid/utils.py
CHANGED
|
@@ -77,7 +77,7 @@ def format_chat_ag_response(chatbot: List[Any]) -> List[Any]:
|
|
| 77 |
"""
|
| 78 |
sources = ""
|
| 79 |
if chatbot:
|
| 80 |
-
title = chatbot[-1]
|
| 81 |
if title == "Sources HTML":
|
| 82 |
sources = chatbot[-1]["content"]
|
| 83 |
chatbot.pop(-1)
|
|
|
|
| 77 |
"""
|
| 78 |
sources = ""
|
| 79 |
if chatbot:
|
| 80 |
+
title = (chatbot[-1].get("metadata") or {}).get("title", None)
|
| 81 |
if title == "Sources HTML":
|
| 82 |
sources = chatbot[-1]["content"]
|
| 83 |
chatbot.pop(-1)
|