Spaces:

Matcry
/

Rabbook

Sleeping

App Files Files Community

Rabbook / evaluation /run_cases.py

Matcry

Deploy snapshot

c76423f 24 days ago

Raw

History Blame Contribute Delete

1.89 kB

	"""
	Run specific dataset cases by index (1-based) through the tool agent.

	Usage: python -m evaluation.run_cases 6 24 30 51 88 98
	Prints, per case: time, tools, the ground truth, and the full answer — so the
	answers can be judged by hand. Model is whatever RABBOOK_LLM_MODEL is set to.
	"""
	import sys
	import time
	import warnings

	from dotenv import load_dotenv

	# Load variables from a .env file before the project imports below run --
	# presumably so core.config sees RABBOOK_LLM_MODEL at import time
	# (NOTE(review): confirm against core.config).
	load_dotenv()
	# Ignore DeprecationWarning so it does not clutter the per-case output.
	warnings.filterwarnings("ignore", category=DeprecationWarning)

	from core.config import DEFAULT_LLM_MODEL
	from agents.tool_agent import run_tool_agent
	from .eval_common import build_embeddings, build_llm, build_reranker, load_dataset


	def main():
	    """Run the dataset cases named on the command line through the tool agent.

	    Reads 1-based case indices from ``sys.argv[1:]``. For each one it calls
	    ``run_tool_agent`` on the case's question and prints the elapsed time, the
	    tools recorded in the trace, the reference answer and the agent's answer,
	    so the answers can be judged by hand.

	    Arguments are validated before any model is built: a non-integer argument
	    or an index below 1 prints a message plus the usage line and returns. An
	    index past the end of the dataset is reported and skipped, and the
	    remaining cases still run. An exception raised by the agent is printed as
	    that case's answer instead of aborting the run.

	    Returns:
	        None.
	    """
	    usage = "Give 1-based case indices, e.g. python -m evaluation.run_cases 6 24 30"
	    args = sys.argv[1:]
	    if not args:
	        print(usage)
	        return

	    try:
	        indices = [int(arg) for arg in args]
	    except ValueError:
	        print(f"Case indices must be integers, got: {' '.join(args)}")
	        print(usage)
	        return

	    # Indices are 1-based: 0 or a negative value would otherwise wrap around
	    # through negative indexing and silently run the wrong case.
	    invalid = [idx for idx in indices if idx < 1]
	    if invalid:
	        print(f"Case indices must be >= 1, got: {invalid}")
	        print(usage)
	        return

	    print(f"Model: {DEFAULT_LLM_MODEL}")
	    print("Initializing models...")
	    llm = build_llm()
	    embeddings = build_embeddings()
	    reranker = build_reranker()

	    dataset = load_dataset()
	    total = len(dataset)

	    for idx in indices:
	        if idx > total:
	            # Report and move on so one bad index does not discard the
	            # remaining cases after the models were already built.
	            print("\n" + "=" * 80)
	            print(f"CASE {idx} skipped: dataset only has {total} cases")
	            continue

	        case = dataset[idx - 1]
	        question = case["question"]
	        reference = case.get("ground_truth", "")
	        expected = case.get("expected_behavior", "answer")

	        start = time.perf_counter()
	        trace: list = []
	        try:
	            answer = run_tool_agent(
	                question, llm=llm, embeddings=embeddings, reranker=reranker, trace=trace
	            )
	        except Exception as exc:
	            # Deliberately broad: a failing case is shown as its answer so the
	            # other requested cases are still evaluated.
	            answer = f"(ERROR) {type(exc).__name__}: {exc}"
	        elapsed = time.perf_counter() - start
	        tools = [step["tool"] for step in trace if "tool" in step]

	        # Guard against a None / non-string return so printing cannot raise.
	        answer_text = "" if answer is None else str(answer)

	        print("\n" + "=" * 80)
	        print(f"CASE {idx} ({expected}) | {elapsed:.1f}s | tools={tools}")
	        print(f"Q: {question}")
	        print(f"REF: {reference}")
	        print(f"ANS: {answer_text if answer_text.strip() else '(empty)'}")


	# Run the cases only when executed as a script or via
	# `python -m evaluation.run_cases`, not when the module is imported.
	if __name__ == "__main__":
	    main()