""" End-to-end evaluation of the tool agent. Checks whether the agent: - routes tool calls at all (rather than answering from memory) - calls query_documents before web_search (local-RAG-first routing) - finishes within the iteration limit - refuses to answer unanswerable questions rather than fabricating a response No LLM judge is used. All checks are deterministic heuristics so results are reproducible without API credits. """ import warnings from dotenv import load_dotenv load_dotenv() warnings.filterwarnings("ignore", category=DeprecationWarning) from agents.tool_agent import run_tool_agent from .eval_common import ( build_embeddings, build_llm, build_reranker, load_dataset, ) ITERATION_LIMIT_MESSAGE = "Agent reached the iteration limit without a final answer." # Phrases that indicate the agent acknowledged it could not find the answer. _REFUSAL_PHRASES = [ "don't have", "do not have", "cannot find", "not available", "no information", "not mentioned", "not provided", "not contain", "does not contain", "doesn't contain", "doesn't provide", "does not provide", "unable to find", "i don't know", ] def _fallback_handled(answer: str) -> bool: """ Return True when the agent appears to have refused rather than fabricated. Heuristic: the answer contains at least one refusal/can't-find signal phrase. Domain-agnostic — works for any question type, not just salary or compensation queries. """ answer_lower = answer.lower() return any(phrase in answer_lower for phrase in _REFUSAL_PHRASES) def main(): print("Initializing models...") llm = build_llm() embeddings = build_embeddings() reranker = build_reranker() dataset = load_dataset() print(f"\nRunning tool agent on {len(dataset)} cases...\n") col_q = 60 print(f"{'Question':<{col_q}} {'Tools':<32} Fin Fallback?") print("-" * 110) total_called_a_tool = 0 total_used_local_first = 0 local_first_applicable = 0 # cases where query_documents appeared at all total_finished = 0 total_errors = 0 fallback_cases_total = 0 fallback_cases_handled = 0 for case in dataset: question = case["question"] expected_behavior = case.get("expected_behavior", "answer") # Run the agent. A tool may raise (e.g. a failed web fetch); we record # the case as an error and keep going rather than aborting the whole run. trace: list = [] try: answer = run_tool_agent( question, llm=llm, embeddings=embeddings, reranker=reranker, trace=trace, ) except Exception as exc: total_errors += 1 tool_sequence = [step["tool"] for step in trace if "tool" in step] print( f"{question[:col_q]:<{col_q}} {str(tool_sequence)[:32]:<32} " f"ERR {type(exc).__name__}: {str(exc)[:60]}" ) continue tool_sequence = [step["tool"] for step in trace if "tool" in step] called_a_tool = len(tool_sequence) > 0 finished = answer != ITERATION_LIMIT_MESSAGE # Local-first: only meaningful when query_documents appears in the sequence. used_local_first = False if "query_documents" in tool_sequence: local_first_applicable += 1 used_local_first = tool_sequence[0] == "query_documents" if used_local_first: total_used_local_first += 1 if called_a_tool: total_called_a_tool += 1 if finished: total_finished += 1 fallback_label = "" if expected_behavior == "fallback": fallback_cases_total += 1 handled = _fallback_handled(answer) if handled: fallback_cases_handled += 1 fallback_label = "OK" if handled else "FAIL" tool_str = str(tool_sequence)[:32] fin_label = "yes" if finished else "NO" print( f"{question[:col_q]:<{col_q}} {tool_str:<32} {fin_label:<3} {fallback_label}" ) n = len(dataset) print() print("=" * 60) print(f"Agent evaluation summary ({n} cases)") print("=" * 60) print(f" Called a tool: {total_called_a_tool}/{n}") print(f" Used local RAG first: {total_used_local_first}/{local_first_applicable}" f" (of cases that called query_documents)") print(f" Finished within limit: {total_finished}/{n}") if fallback_cases_total: print(f" Fallback handled: {fallback_cases_handled}/{fallback_cases_total}" f" (refused with a can't-find signal phrase)") if __name__ == "__main__": main()