Spaces:

Matcry
/

Rabbook

Running

App Files Files Community

Rabbook / evaluation /time_agent.py

Matcry

Deploy snapshot

c76423f 21 days ago

Raw

History Blame Contribute Delete

3.31 kB

	"""
	Quick timing probe for the tool agent.

	For each case it prints:
	- the wall-clock time the case STARTED (separate column)
	- how long the agent took (seconds)
	- the FIRST tool the agent called
	- the full tool sequence

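	Runs a small slice of the dataset (default 5 cases) so it's fast to eyeball,
	and saves the per-case results to evaluation/data/time_agent_results.json.

	Usage: python -m evaluation.time_agent [count] [offset]
	e.g. `python -m evaluation.time_agent 10` times the first 10 cases; an
	offset > 0 times a later slice and appends to the existing results file.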
	"""
	import json
	import sys
	import time
	import warnings
	from datetime import datetime

	from dotenv import load_dotenv

	load_dotenv()

	warnings.filterwarnings("ignore", category=DeprecationWarning)

	from agents.tool_agent import run_tool_agent
	from .eval_common import (
	build_embeddings,
	build_llm,
	build_reranker,
	load_dataset,
	)

	RESULTS_PATH = "evaluation/data/time_agent_results.json"


	def _parse_cli_args(argv):
	    """Return ``(count, offset)`` parsed from the arguments after the program name.

	    Missing values fall back to 5 cases and offset 0.  Non-integer or
	    negative values abort with a usage message: a negative number would
	    otherwise be treated as a from-the-end slice index and silently time
	    the wrong cases.
	    """
	    usage = "Usage: python -m evaluation.time_agent [count] [offset]"
	    try:
	        count = int(argv[0]) if len(argv) > 0 else 5
	        offset = int(argv[1]) if len(argv) > 1 else 0
	    except ValueError:
	        given = " ".join(argv[:2])
	        sys.exit(f"{usage}\ncount and offset must be integers, got: {given}")
	    if count < 0 or offset < 0:
	        sys.exit(f"{usage}\ncount and offset must be >= 0")
	    return count, offset


	def _load_existing_results():
	    """Return the results already saved at RESULTS_PATH, or [] if there is no file.

	    Raises:
	        json.JSONDecodeError: the file exists but is not valid JSON.
	        TypeError: the file holds valid JSON that is not a list, so new
	            results cannot be appended to it.
	    """
	    try:
	        with open(RESULTS_PATH, "r", encoding="utf-8") as f:
	            existing = json.load(f)
	    except FileNotFoundError:
	        return []
	    if not isinstance(existing, list):
	        raise TypeError(
	            f"{RESULTS_PATH} must contain a JSON list to append to, "
	            f"found {type(existing).__name__}"
	        )
	    return existing


	def _write_results(results):
	    """Write *results* to RESULTS_PATH as indented UTF-8 JSON."""
	    import os  # local import: the module header does not import os

	    # Serialize BEFORE opening the file: open(..., "w") truncates immediately,
	    # so a failure inside json.dump would wipe previously saved results.
	    payload = json.dumps(results, ensure_ascii=False, indent=2)

	    # The output directory may not exist yet; create it rather than lose the run.
	    directory = os.path.dirname(RESULTS_PATH)
	    if directory:
	        os.makedirs(directory, exist_ok=True)

	    with open(RESULTS_PATH, "w", encoding="utf-8") as f:
	        f.write(payload)


	def _time_case(case, llm, embeddings, reranker):
	    """Run the tool agent on one dataset case and time it.

	    Returns ``(start_clock, elapsed, record)`` where ``start_clock`` is the
	    local wall-clock start time as HH:MM:SS, ``elapsed`` is the duration in
	    seconds, and ``record`` is the JSON-ready result dict for the case.
	    Any ``Exception`` raised by the agent is captured in ``record["error"]``
	    so one failing case does not abort the whole probe.
	    """
	    question = case["question"]

	    start_clock = datetime.now().strftime("%H:%M:%S")
	    start = time.perf_counter()

	    trace: list = []  # filled in by run_tool_agent as it works
	    error = None
	    answer = ""
	    try:
	        answer = run_tool_agent(
	            question,
	            llm=llm,
	            embeddings=embeddings,
	            reranker=reranker,
	            trace=trace,
	        )
	    except Exception as exc:
	        error = f"{type(exc).__name__}: {exc}"

	    # Measured after the try/except so failed cases are timed as well.
	    elapsed = time.perf_counter() - start

	    record = {
	        "question": question,
	        "ground_truth": case.get("ground_truth", ""),
	        "expected_behavior": case.get("expected_behavior", "answer"),
	        "answer": answer,
	        "error": error,
	        "elapsed_sec": round(elapsed, 1),
	        # Only trace steps that carry a "tool" key count as tool calls.
	        "tools": [step["tool"] for step in trace if "tool" in step],
	    }
	    return start_clock, elapsed, record


	def main():
	    """Time the tool agent on a slice of the dataset and save the results.

	    Reads an optional count and offset from ``sys.argv``, prints one table
	    row per case (start time, seconds, first tool, question, tool sequence,
	    error if any) and writes all records to RESULTS_PATH.
	    """
	    # Usage: python -m evaluation.time_agent [count] [offset]
	    # offset > 0 runs a later slice and APPENDS to the existing results file.
	    n, offset = _parse_cli_args(sys.argv[1:])

	    if offset > 0:
	        # Fail fast: an unreadable results file should surface before the
	        # agent run is paid for, not after it.
	        _load_existing_results()

	    print("Initializing models...")
	    llm = build_llm()
	    embeddings = build_embeddings()
	    reranker = build_reranker()

	    dataset = load_dataset()[offset:offset + n]
	    print(f"\nTiming {len(dataset)} cases (offset {offset})...\n")

	    col_q = 50
	    print(f"{'Started at':<12} {'Secs':>6} {'First tool':<16} {'Question':<{col_q}} Tools")
	    print("-" * 130)

	    results = []

	    for case in dataset:
	        start_clock, elapsed, record = _time_case(case, llm, embeddings, reranker)
	        results.append(record)

	        question = record["question"]
	        tool_sequence = record["tools"]
	        first_tool = tool_sequence[0] if tool_sequence else "(none)"
	        tail = record["error"] or ""
	        # !s keeps the row printable even if a trace step's tool is not a str.
	        print(
	            f"{start_clock:<12} {elapsed:6.1f} {first_tool!s:<16} "
	            f"{question[:col_q]:<{col_q}} {tool_sequence} {tail}"
	        )

	    # When running a later slice, append to whatever is already on disk.
	    # Re-read here (not reusing the pre-flight read) so results saved while
	    # this run was in progress are kept.
	    if offset > 0:
	        results = _load_existing_results() + results

	    _write_results(results)

	    print("-" * 130)
	    print(f"Wrote {len(results)} total results (with answers) to {RESULTS_PATH}")


	# Script entry point: run the timing probe only when this module is executed
	# directly (e.g. `python -m evaluation.time_agent`), never on import.
	if __name__ == "__main__":
	    main()