""" Quick timing probe for the tool agent. For each case it prints: - the wall-clock time the case STARTED (separate column) - how long the agent took (seconds) - the FIRST tool the agent called - the full tool sequence Runs a small slice of the dataset (default 5) so it's fast to eyeball. Override the count with: python -m evaluation.time_agent 10 """ import json import sys import time import warnings from datetime import datetime from dotenv import load_dotenv load_dotenv() warnings.filterwarnings("ignore", category=DeprecationWarning) from agents.tool_agent import run_tool_agent from .eval_common import ( build_embeddings, build_llm, build_reranker, load_dataset, ) RESULTS_PATH = "evaluation/data/time_agent_results.json" def main(): # Usage: python -m evaluation.time_agent [count] [offset] # offset > 0 runs a later slice and APPENDS to the existing results file. n = int(sys.argv[1]) if len(sys.argv) > 1 else 5 offset = int(sys.argv[2]) if len(sys.argv) > 2 else 0 print("Initializing models...") llm = build_llm() embeddings = build_embeddings() reranker = build_reranker() dataset = load_dataset()[offset:offset + n] print(f"\nTiming {len(dataset)} cases (offset {offset})...\n") col_q = 50 print(f"{'Started at':<12} {'Secs':>6} {'First tool':<16} {'Question':<{col_q}} Tools") print("-" * 130) results = [] for case in dataset: question = case["question"] reference = case.get("ground_truth", "") expected = case.get("expected_behavior", "answer") start_clock = datetime.now().strftime("%H:%M:%S") start = time.perf_counter() trace: list = [] error = None answer = "" try: answer = run_tool_agent( question, llm=llm, embeddings=embeddings, reranker=reranker, trace=trace, ) except Exception as exc: error = f"{type(exc).__name__}: {exc}" elapsed = time.perf_counter() - start tool_sequence = [step["tool"] for step in trace if "tool" in step] first_tool = tool_sequence[0] if tool_sequence else "(none)" results.append({ "question": question, "ground_truth": reference, "expected_behavior": expected, "answer": answer, "error": error, "elapsed_sec": round(elapsed, 1), "tools": tool_sequence, }) tail = error if error else "" print( f"{start_clock:<12} {elapsed:6.1f} {first_tool:<16} " f"{question[:col_q]:<{col_q}} {tool_sequence} {tail}" ) # When running a later slice, append to whatever is already on disk. if offset > 0: try: with open(RESULTS_PATH, "r", encoding="utf-8") as f: existing = json.load(f) except FileNotFoundError: existing = [] results = existing + results with open(RESULTS_PATH, "w", encoding="utf-8") as f: json.dump(results, f, ensure_ascii=False, indent=2) print("-" * 130) print(f"Wrote {len(results)} total results (with answers) to {RESULTS_PATH}") if __name__ == "__main__": main()