#!/usr/bin/env python3
"""
Run LangSmith evaluation experiments for the Cashy agent.

Usage:
    uv run python scripts/run_eval.py                                                # Run experiment (default dataset + prefix)
    uv run python scripts/run_eval.py --prefix cashy-new-prompt                      # A/B test with custom prefix
    uv run python scripts/run_eval.py --dataset cashy-eval-v2.0                      # Use specific dataset
    uv run python scripts/run_eval.py --upload                                       # Upload eval cases to LangSmith
    uv run python scripts/run_eval.py --upload --file eval_cases/eval_cases_v1.json  # Upload specific file
"""
import argparse
import json
import logging
import sys
from pathlib import Path

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from dotenv import load_dotenv

load_dotenv(Path(__file__).parent.parent / ".env")
from langsmith.evaluation import evaluate
from langsmith import Client
from langchain_core.messages import HumanMessage

from src.agent.graph import create_agent
from scripts.evaluators import all_evaluators

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)-12s] %(levelname)-7s %(message)s",
    datefmt="%H:%M:%S",
)
# Quiet the chatty HTTP client loggers
for name in ("httpx", "httpcore", "urllib3", "hf_transfer"):
    logging.getLogger(name).setLevel(logging.WARNING)
logger = logging.getLogger("cashy.eval")

EVAL_FILE = Path(__file__).parent.parent / "eval_cases" / "eval_cases_v2.json"
DEFAULT_DATASET = "cashy-eval-v2.0"
DEFAULT_PREFIX = "cashy-baseline"
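
# For reference, the eval-cases file is assumed to look roughly like this;
# the shape is inferred from the fields read in upload_to_langsmith() below,
# and the values here are purely illustrative:
#
#   {
#     "metadata": {"version": "2.0", "description": "..."},
#     "cases": [
#       {
#         "id": "balance-001",
#         "category": "balances",
#         "input": "What's my checking balance?",
#         "expected_tools": ["get_balance"],
#         "expected_tool_args": {"get_balance": {"account": "checking"}},
#         "expected_output_contains": ["checking"],
#         "evaluation_criteria": ["states the balance"]
#       }
#     ]
#   }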


def make_target(agent):
    """Create a target function that wraps the agent for langsmith evaluate()."""

    def run_agent(inputs: dict) -> dict:
        try:
            result = agent.invoke({"messages": [HumanMessage(content=inputs["input"])]})
            response = result["messages"][-1].content
            # Collect every tool call the agent made across the whole run
            tools_called = []
            tool_args = []
            for msg in result["messages"]:
                if hasattr(msg, "tool_calls") and msg.tool_calls:
                    for tc in msg.tool_calls:
                        tools_called.append(tc["name"])
                        tool_args.append(tc.get("args", {}))
            return {
                "response": response,
                "tools_called": tools_called,
                "tool_args": tool_args,
                "error": None,
            }
        except Exception as e:
            # Surface failures as structured output so evaluators can score them
            logger.error("Agent error: %s", e)
            return {
                "response": None,
                "tools_called": [],
                "tool_args": [],
                "error": str(e),
            }

    return run_agent
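

# `all_evaluators` (imported above) is expected to follow LangSmith's
# custom-evaluator contract: a callable taking a run and an example and
# returning a dict with "key" and "score". A hypothetical sketch of one such
# check, matched to the outputs run_agent produces (the real evaluators live
# in scripts/evaluators.py):
#
#   def tools_match(run, example) -> dict:
#       expected = set(example.outputs.get("expected_tools", []))
#       actual = set(run.outputs.get("tools_called", []))
#       return {"key": "tools_match", "score": float(expected.issubset(actual))}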


def upload_to_langsmith(eval_data: dict):
    """Upload eval cases as a LangSmith dataset with enriched outputs."""
    client = Client()
    version = eval_data["metadata"]["version"]
    dataset_name = f"cashy-eval-v{version}"
    try:
        dataset = client.create_dataset(
            dataset_name=dataset_name,
            description=eval_data["metadata"]["description"],
        )
        logger.info("Created dataset: %s", dataset_name)
    except Exception:
        # create_dataset raises if the name is taken; fall back to the existing dataset
        dataset = client.read_dataset(dataset_name=dataset_name)
        logger.info("Dataset already exists: %s", dataset_name)
    for case in eval_data["cases"]:
        client.create_example(
            inputs={"input": case["input"]},
            outputs={
                "expected_tools": case.get("expected_tools", []),
                "expected_output_contains": case.get("expected_output_contains", []),
                "expected_tool_args": case.get("expected_tool_args", {}),
            },
            dataset_id=dataset.id,
            metadata={
                "category": case.get("category"),
                "case_id": case["id"],
                "criteria": case.get("evaluation_criteria", []),
            },
        )
    logger.info("Uploaded %d examples to dataset '%s'", len(eval_data["cases"]), dataset_name)


def main():
    parser = argparse.ArgumentParser(description="Run Cashy LangSmith evaluation")
    parser.add_argument("--dataset", default=DEFAULT_DATASET, help="LangSmith dataset name")
    parser.add_argument("--prefix", default=DEFAULT_PREFIX, help="Experiment prefix (for A/B naming)")
    parser.add_argument("--upload", action="store_true", help="Upload eval cases to LangSmith dataset")
    parser.add_argument("--file", default=str(EVAL_FILE), help="Local eval cases JSON file")
    args = parser.parse_args()

    if args.upload:
        eval_data = json.loads(Path(args.file).read_text())
        logger.info("Loaded %d eval cases from %s", len(eval_data["cases"]), args.file)
        upload_to_langsmith(eval_data)
        return

    logger.info("Creating agent...")
    agent = create_agent()
    target = make_target(agent)

    logger.info("Running experiment '%s' on dataset '%s'...", args.prefix, args.dataset)
    evaluate(
        target,
        data=args.dataset,
        evaluators=all_evaluators,
        experiment_prefix=args.prefix,
        max_concurrency=0,  # 0 disables concurrency: cases run one at a time
    )

    print("\n" + "=" * 60)
    print(f"Experiment '{args.prefix}' complete.")
    print(f"View results in LangSmith: Datasets > {args.dataset} > Experiments")
    print("=" * 60)


if __name__ == "__main__":
    main()