| """ |
| Quick timing probe for the tool agent. |
| |
| For each case it prints: |
| - the wall-clock time the case STARTED (separate column) |
| - how long the agent took (seconds) |
| - the FIRST tool the agent called |
| - the full tool sequence |
| |
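| Runs a small slice of the dataset (default 5 cases) so it's fast to eyeball. |
| Override the count (and, optionally, the starting offset) with: |
| python -m evaluation.time_agent 10 [OFFSET] |
| |
| Results are also saved as JSON to evaluation/data/time_agent_results.json; |
| a non-zero offset appends to the existing file instead of overwriting it. |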
| """ |
| import json |
| import sys |
| import time |
| import warnings |
| from datetime import datetime |
|
|
| from dotenv import load_dotenv |
|
|
| load_dotenv() |
|
|
| warnings.filterwarnings("ignore", category=DeprecationWarning) |
|
|
| from agents.tool_agent import run_tool_agent |
| from .eval_common import ( |
| build_embeddings, |
| build_llm, |
| build_reranker, |
| load_dataset, |
| ) |
|
|
# JSON file that main() writes the per-case results to. The path is relative,
# so it resolves against the current working directory of the process.
| RESULTS_PATH = "evaluation/data/time_agent_results.json" |
|
|
|
|
def main() -> None:
    """Time the tool agent on a slice of the dataset and save the results.

    Optional positional command-line arguments:
        1. number of cases to run (default 5)
        2. index of the first case in the dataset (default 0)

    For every case one table row is printed: wall-clock start time, elapsed
    seconds, the first tool found in the trace, the question truncated to the
    column width, the full tool sequence and, if the agent raised, the error
    text.

    All results are written to ``RESULTS_PATH`` as JSON. With an offset
    greater than 0 the new results are appended to the list already stored
    in that file (a missing file counts as empty); otherwise the file is
    overwritten.

    Raises:
        ValueError: if a command-line argument is not an integer, or the
            existing results file is not valid JSON.
        TypeError: if the existing results file does not contain a JSON list.
    """
    import os  # function-scope import keeps this block self-contained

    n = int(sys.argv[1]) if len(sys.argv) > 1 else 5
    offset = int(sys.argv[2]) if len(sys.argv) > 2 else 0

    # Sort out the output location BEFORE the slow model/agent work, so that
    # an unreadable previous file or a missing output directory fails fast
    # instead of throwing away a finished run at write time.
    previous = _load_previous_results(RESULTS_PATH) if offset > 0 else []
    out_dir = os.path.dirname(RESULTS_PATH)
    if out_dir:  # dirname is "" for a bare filename; makedirs("") would raise
        os.makedirs(out_dir, exist_ok=True)

    print("Initializing models...")
    llm = build_llm()
    embeddings = build_embeddings()
    reranker = build_reranker()

    dataset = load_dataset()[offset:offset + n]
    print(f"\nTiming {len(dataset)} cases (offset {offset})...\n")

    col_q = 50  # width of the (truncated) question column
    print(f"{'Started at':<12} {'Secs':>6} {'First tool':<16} {'Question':<{col_q}} Tools")
    print("-" * 130)

    results = []

    for case in dataset:
        question = case["question"]
        reference = case.get("ground_truth", "")
        expected = case.get("expected_behavior", "answer")

        # Wall-clock time is for display only; the duration itself comes from
        # the monotonic perf_counter.
        start_clock = datetime.now().strftime("%H:%M:%S")
        start = time.perf_counter()

        trace: list = []
        error = None
        answer = ""
        try:
            answer = run_tool_agent(
                question,
                llm=llm,
                embeddings=embeddings,
                reranker=reranker,
                trace=trace,
            )
        except Exception as exc:
            # One failing case must not abort the whole probe; the error is
            # recorded and shown at the end of the case's row.
            error = f"{type(exc).__name__}: {exc}"

        elapsed = time.perf_counter() - start
        tool_sequence = [step["tool"] for step in trace if "tool" in step]
        first_tool = tool_sequence[0] if tool_sequence else "(none)"

        results.append({
            "question": question,
            "ground_truth": reference,
            "expected_behavior": expected,
            "answer": answer,
            "error": error,
            "elapsed_sec": round(elapsed, 1),
            "tools": tool_sequence,
        })

        tail = error if error else ""
        print(
            f"{start_clock:<12} {elapsed:6.1f} {first_tool:<16} "
            f"{question[:col_q]:<{col_q}} {tool_sequence} {tail}"
        )

    # `previous` is empty unless an offset was given, so this both appends to
    # an earlier run and covers the plain overwrite case.
    results = previous + results

    with open(RESULTS_PATH, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print("-" * 130)
    print(f"Wrote {len(results)} total results (with answers) to {RESULTS_PATH}")


def _load_previous_results(path: str) -> list:
    """Return the JSON list stored at *path*, or ``[]`` if the file is missing."""
    try:
        with open(path, "r", encoding="utf-8") as f:
            previous = json.load(f)
    except FileNotFoundError:
        return []
    if not isinstance(previous, list):
        raise TypeError(
            f"{path} must contain a JSON list of results, "
            f"found {type(previous).__name__}"
        )
    return previous
|
|
|
|
# Run the probe only when this file is executed directly (for example with
# `python -m evaluation.time_agent`); importing the module does not call main().
| if __name__ == "__main__": |
| main() |
|
|