| """ |
| Run specific dataset cases by index (1-based) through the tool agent. |
| |
| Usage: python -m evaluation.run_cases 6 24 30 51 88 98 |
| Prints, per case: time, tools, the ground truth, and the full answer — so the |
| answers can be judged by hand. Model is whatever RABBOOK_LLM_MODEL is set to. |
| """ |
| import sys |
| import time |
| import warnings |
|
|
| from dotenv import load_dotenv |
|
|
| load_dotenv() |
| warnings.filterwarnings("ignore", category=DeprecationWarning) |
|
|
| from core.config import DEFAULT_LLM_MODEL |
| from agents.tool_agent import run_tool_agent |
| from .eval_common import build_embeddings, build_llm, build_reranker, load_dataset |
|
|
|
|
| def main(): |
| indices = [int(a) for a in sys.argv[1:]] |
| if not indices: |
| print("Give 1-based case indices, e.g. python -m evaluation.run_cases 6 24 30") |
| return |
|
|
| print(f"Model: {DEFAULT_LLM_MODEL}") |
| print("Initializing models...") |
| llm = build_llm() |
| embeddings = build_embeddings() |
| reranker = build_reranker() |
|
|
| dataset = load_dataset() |
|
|
| for idx in indices: |
| case = dataset[idx - 1] |
| question = case["question"] |
| reference = case.get("ground_truth", "") |
| expected = case.get("expected_behavior", "answer") |
|
|
| start = time.perf_counter() |
| trace: list = [] |
| try: |
| answer = run_tool_agent( |
| question, llm=llm, embeddings=embeddings, reranker=reranker, trace=trace |
| ) |
| except Exception as exc: |
| answer = f"(ERROR) {type(exc).__name__}: {exc}" |
| elapsed = time.perf_counter() - start |
| tools = [s["tool"] for s in trace if "tool" in s] |
|
|
| print("\n" + "=" * 80) |
| print(f"CASE {idx} ({expected}) | {elapsed:.1f}s | tools={tools}") |
| print(f"Q: {question}") |
| print(f"REF: {reference}") |
| print(f"ANS: {answer if answer.strip() else '(empty)'}") |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|