Rabbook / evaluation /time_agent.py
Matcry's picture
Deploy snapshot
c76423f
Raw
History Blame Contribute Delete
3.31 kB
"""
Quick timing probe for the tool agent.
For each case it prints:
- the wall-clock time the case STARTED (separate column)
- how long the agent took (seconds)
- the FIRST tool the agent called
- the full tool sequence
Runs a small slice of the dataset (default 5) so it's fast to eyeball.
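Override the count with: python -m evaluation.time_agent 10
An optional second argument sets the starting offset into the dataset, e.g.
python -m evaluation.time_agent 10 5; an offset greater than 0 appends to the
existing results file instead of overwriting it.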
"""
import json
import sys
import time
import warnings
from datetime import datetime
from dotenv import load_dotenv
# Load variables from a .env file into the environment. This runs before the
# project imports below -- presumably because those modules read configuration
# from the environment at import time (TODO confirm).
load_dotenv()
# Ignore DeprecationWarning from this point on.
warnings.filterwarnings("ignore", category=DeprecationWarning)
from agents.tool_agent import run_tool_agent
from .eval_common import (
build_embeddings,
build_llm,
build_reranker,
load_dataset,
)
# JSON file that main() writes the per-case results to. The path is relative,
# so it is resolved against the current working directory.
RESULTS_PATH = "evaluation/data/time_agent_results.json"
def main():
    """Time the tool agent on a slice of the dataset and save the results.

    Usage: ``python -m evaluation.time_agent [count] [offset]``

    * ``count``  -- number of cases to run (default 5).
    * ``offset`` -- index of the first case to run (default 0). When it is
      greater than 0, the new results are appended to whatever is already
      stored in ``RESULTS_PATH``; otherwise that file is overwritten.

    For each case one table row is printed: the wall-clock start time, the
    elapsed seconds, the first tool called, the (truncated) question, the
    full tool sequence and, if the agent raised, the error text. All results,
    including the answers, are then written to ``RESULTS_PATH`` as JSON.

    Raises:
        ValueError: if ``count`` or ``offset`` is not an integer.
        json.JSONDecodeError: if appending and the existing results file is
            not valid JSON.
        TypeError: if appending and the existing results file does not hold
            a JSON list, or if a result cannot be serialized to JSON.
    """
    # Local import: only needed here, keeps the module's import block untouched.
    from pathlib import Path

    n = int(sys.argv[1]) if len(sys.argv) > 1 else 5
    offset = int(sys.argv[2]) if len(sys.argv) > 2 else 0

    print("Initializing models...")
    llm = build_llm()
    embeddings = build_embeddings()
    reranker = build_reranker()

    dataset = load_dataset()[offset:offset + n]
    print(f"\nTiming {len(dataset)} cases (offset {offset})...\n")

    col_q = 50  # width of the question column in the printed table
    print(f"{'Started at':<12} {'Secs':>6} {'First tool':<16} {'Question':<{col_q}} Tools")
    print("-" * 130)

    results = []
    for case in dataset:
        question = case["question"]
        reference = case.get("ground_truth", "")
        expected = case.get("expected_behavior", "answer")

        start_clock = datetime.now().strftime("%H:%M:%S")
        start = time.perf_counter()
        trace: list = []
        error = None
        answer = ""
        try:
            answer = run_tool_agent(
                question,
                llm=llm,
                embeddings=embeddings,
                reranker=reranker,
                trace=trace,
            )
        except Exception as exc:
            # Deliberate best-effort: a failing case is recorded with its
            # error and the run continues with the next case.
            error = f"{type(exc).__name__}: {exc}"
        elapsed = time.perf_counter() - start

        tool_sequence = [step["tool"] for step in trace if "tool" in step]
        first_tool = tool_sequence[0] if tool_sequence else "(none)"

        results.append({
            "question": question,
            "ground_truth": reference,
            "expected_behavior": expected,
            "answer": answer,
            "error": error,
            "elapsed_sec": round(elapsed, 1),
            "tools": tool_sequence,
        })

        tail = error if error else ""
        # str() keeps the row printable even if a trace entry's tool value is
        # not a string (e.g. None), which would reject the "<16" format spec.
        print(
            f"{start_clock:<12} {elapsed:6.1f} {str(first_tool):<16} "
            f"{question[:col_q]:<{col_q}} {tool_sequence} {tail}"
        )

    # When running a later slice, append to whatever is already on disk.
    if offset > 0:
        try:
            with open(RESULTS_PATH, "r", encoding="utf-8") as f:
                existing = json.load(f)
        except FileNotFoundError:
            existing = []
        if not isinstance(existing, list):
            raise TypeError(
                f"{RESULTS_PATH} must contain a JSON list to append to, "
                f"found {type(existing).__name__}"
            )
        results = existing + results

    # Serialize BEFORE opening the file for writing: opening with "w"
    # truncates it, so a serialization error raised mid-dump would otherwise
    # leave a corrupt file and destroy previously accumulated results.
    payload = json.dumps(results, ensure_ascii=False, indent=2)

    # Make sure the output directory exists so the timing run is not lost to
    # a FileNotFoundError at the very end.
    Path(RESULTS_PATH).parent.mkdir(parents=True, exist_ok=True)
    with open(RESULTS_PATH, "w", encoding="utf-8") as f:
        f.write(payload)

    print("-" * 130)
    print(f"Wrote {len(results)} total results (with answers) to {RESULTS_PATH}")


if __name__ == "__main__":
    main()