#!/usr/bin/env python3
"""
Run LangSmith evaluation experiments for the Cashy agent.

Usage:
    uv run python scripts/run_eval.py                                  # Run experiment (default dataset + prefix)
    uv run python scripts/run_eval.py --prefix cashy-new-prompt        # A/B test with custom prefix
    uv run python scripts/run_eval.py --dataset cashy-eval-v2.0        # Use specific dataset
    uv run python scripts/run_eval.py --upload                         # Upload eval cases to LangSmith
    uv run python scripts/run_eval.py --upload --file eval_cases/eval_cases_v1.json  # Upload specific file
"""

import json
import logging
import argparse
import sys
from pathlib import Path

# Add project root to path so the `src.` and `scripts.` imports below resolve
# when this file is executed directly as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))

from dotenv import load_dotenv

# Load the project-level .env before the remaining third-party/project imports.
load_dotenv(Path(__file__).parent.parent / ".env")

from langsmith.evaluation import evaluate
from langsmith import Client
from langchain_core.messages import HumanMessage

from src.agent.graph import create_agent
from scripts.evaluators import all_evaluators

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)-12s] %(levelname)-7s %(message)s",
    datefmt="%H:%M:%S",
)
# Quieten chatty HTTP/transfer libraries so eval progress stays readable.
for name in ("httpx", "httpcore", "urllib3", "hf_transfer"):
    logging.getLogger(name).setLevel(logging.WARNING)

logger = logging.getLogger("cashy.eval")

EVAL_FILE = Path(__file__).parent.parent / "eval_cases" / "eval_cases_v2.json"
DEFAULT_DATASET = "cashy-eval-v2.0"
DEFAULT_PREFIX = "cashy-baseline"


def make_target(agent):
    """Create a target function that wraps the agent for langsmith evaluate().

    Args:
        agent: Object exposing ``invoke(state)``; it is called with
            ``{"messages": [HumanMessage(...)]}`` and must return a mapping
            with a ``"messages"`` list.

    Returns:
        A callable ``run_agent(inputs) -> dict``. The returned dict always has
        the keys ``response``, ``tools_called``, ``tool_args`` and ``error``.
    """

    def run_agent(inputs: dict) -> dict:
        """Invoke the agent on ``inputs["input"]`` and summarise the run.

        Any exception is caught and reported through the ``error`` key so one
        failing example does not abort the rest of the experiment.
        """
        try:
            result = agent.invoke(
                {"messages": [HumanMessage(content=inputs["input"])]}
            )
            messages = result["messages"]
            response = messages[-1].content

            # Collect every tool call made anywhere in the conversation;
            # names and args are kept as parallel lists.
            tools_called = []
            tool_args = []
            for msg in messages:
                for tc in getattr(msg, "tool_calls", None) or ():
                    tools_called.append(tc["name"])
                    tool_args.append(tc.get("args", {}))

            return {
                "response": response,
                "tools_called": tools_called,
                "tool_args": tool_args,
                "error": None,
            }
        except Exception as e:
            # logger.exception keeps the traceback, which the plain message
            # alone would lose.
            logger.exception("Agent error: %s", e)
            return {
                "response": None,
                "tools_called": [],
                "tool_args": [],
                "error": str(e),
            }

    return run_agent


def upload_to_langsmith(eval_data: dict):
    """Upload eval cases as a LangSmith dataset with enriched outputs.

    Args:
        eval_data: Parsed eval-case file. Reads ``metadata.version``,
            ``metadata.description`` and the ``cases`` list; each case must
            have ``input`` and ``id``, the other fields are optional.

    The dataset is named ``cashy-eval-v<version>``. If creating it fails, the
    dataset is looked up by name instead; a failure of that lookup propagates.
    """
    client = Client()
    version = eval_data["metadata"]["version"]
    dataset_name = f"cashy-eval-v{version}"

    try:
        dataset = client.create_dataset(
            dataset_name=dataset_name,
            description=eval_data["metadata"]["description"],
        )
        logger.info("Created dataset: %s", dataset_name)
    except Exception:
        # Creation failed for any reason; fall back to reading by name. If the
        # read succeeds the dataset does exist, otherwise the read's own
        # exception surfaces to the caller.
        dataset = client.read_dataset(dataset_name=dataset_name)
        logger.info("Dataset already exists: %s", dataset_name)

    # NOTE(review): examples are created unconditionally, so uploading into an
    # already-populated dataset presumably adds the cases a second time --
    # confirm whether that is acceptable.
    for case in eval_data["cases"]:
        client.create_example(
            inputs={"input": case["input"]},
            outputs={
                "expected_tools": case.get("expected_tools", []),
                "expected_output_contains": case.get("expected_output_contains", []),
                "expected_tool_args": case.get("expected_tool_args", {}),
            },
            dataset_id=dataset.id,
            metadata={
                "category": case.get("category"),
                "case_id": case["id"],
                "criteria": case.get("evaluation_criteria", []),
            },
        )

    logger.info(
        "Uploaded %d examples to dataset '%s'",
        len(eval_data["cases"]),
        dataset_name,
    )


def main():
    """Parse CLI arguments, then either upload eval cases or run an experiment.

    With ``--upload`` the JSON file given by ``--file`` is loaded and pushed to
    LangSmith, and the function returns without running the agent. Otherwise
    the agent is created and evaluated against ``--dataset`` under the
    experiment prefix ``--prefix``.
    """
    parser = argparse.ArgumentParser(description="Run Cashy LangSmith evaluation")
    parser.add_argument("--dataset", default=DEFAULT_DATASET, help="LangSmith dataset name")
    parser.add_argument("--prefix", default=DEFAULT_PREFIX, help="Experiment prefix (for A/B naming)")
    parser.add_argument("--upload", action="store_true", help="Upload eval cases to LangSmith dataset")
    parser.add_argument("--file", default=str(EVAL_FILE), help="Local eval cases JSON file")
    args = parser.parse_args()

    if args.upload:
        # Read explicitly as UTF-8 rather than the platform default encoding.
        eval_data = json.loads(Path(args.file).read_text(encoding="utf-8"))
        logger.info("Loaded %d eval cases from %s", len(eval_data["cases"]), args.file)
        upload_to_langsmith(eval_data)
        return

    logger.info("Creating agent...")
    agent = create_agent()
    target = make_target(agent)

    logger.info("Running experiment '%s' on dataset '%s'...", args.prefix, args.dataset)
    # The returned results object is not used here; results are viewed in the
    # LangSmith UI. Per the LangSmith `evaluate` docs, max_concurrency=0 means
    # no concurrency.
    evaluate(
        target,
        data=args.dataset,
        evaluators=all_evaluators,
        experiment_prefix=args.prefix,
        max_concurrency=0,
    )

    print("\n" + "=" * 60)
    print(f"Experiment '{args.prefix}' complete.")
    print(f"View results in LangSmith: Datasets > {args.dataset} > Experiments")
    print("=" * 60)


if __name__ == "__main__":
    main()