#!/usr/bin/env python3
"""
Run LangSmith evaluation experiments for the Cashy agent.

Usage:
    uv run python scripts/run_eval.py                                    # Run experiment (default dataset + prefix)
    uv run python scripts/run_eval.py --prefix cashy-new-prompt          # A/B test with custom prefix
    uv run python scripts/run_eval.py --dataset cashy-eval-v2.0          # Use specific dataset
    uv run python scripts/run_eval.py --upload                           # Upload eval cases to LangSmith
    uv run python scripts/run_eval.py --upload --file eval_cases/eval_cases_v1.json  # Upload specific file
"""

import argparse
import json
import logging
import sys
from pathlib import Path

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))

# Load .env before building the agent or the LangSmith client so API keys
# are available in the environment.
from dotenv import load_dotenv
load_dotenv(Path(__file__).parent.parent / ".env")

from langsmith.evaluation import evaluate
from langsmith import Client
from langchain_core.messages import HumanMessage
from src.agent.graph import create_agent
from scripts.evaluators import all_evaluators

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)-12s] %(levelname)-7s %(message)s",
    datefmt="%H:%M:%S",
)
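# Quiet noisy HTTP/transfer libraries so eval progress stays readable.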
for name in ("httpx", "httpcore", "urllib3", "hf_transfer"):
    logging.getLogger(name).setLevel(logging.WARNING)

logger = logging.getLogger("cashy.eval")

EVAL_FILE = Path(__file__).parent.parent / "eval_cases" / "eval_cases_v2.json"
DEFAULT_DATASET = "cashy-eval-v2.0"
DEFAULT_PREFIX = "cashy-baseline"
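
# Runs that share a dataset but use different --prefix values appear as
# separate experiments in LangSmith, which is what enables prompt A/B tests.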


def make_target(agent):
    """Create a target function that wraps the agent for langsmith evaluate()."""

    def run_agent(inputs: dict) -> dict:
        try:
            result = agent.invoke({"messages": [HumanMessage(content=inputs["input"])]})
            response = result["messages"][-1].content

            # Collect every tool call made during the run so the evaluators
            # can score tool selection and argument accuracy.
            tools_called = []
            tool_args = []
            for msg in result["messages"]:
                if hasattr(msg, "tool_calls") and msg.tool_calls:
                    for tc in msg.tool_calls:
                        tools_called.append(tc["name"])
                        tool_args.append(tc.get("args", {}))

            return {
                "response": response,
                "tools_called": tools_called,
                "tool_args": tool_args,
                "error": None,
            }
        except Exception as e:
            logger.error("Agent error: %s", e)
            return {
                "response": None,
                "tools_called": [],
                "tool_args": [],
                "error": str(e),
            }

    return run_agent
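
# The target's output dict is the contract that the evaluators in
# scripts/evaluators.py score against, roughly:
#   {"response": "...", "tools_called": ["get_balance"],
#    "tool_args": [{"account": "checking"}], "error": None}
# ("get_balance" and its args are hypothetical names for illustration.)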


def upload_to_langsmith(eval_data: dict):
    """Upload eval cases as a LangSmith dataset with enriched outputs."""
    client = Client()
    version = eval_data["metadata"]["version"]
    dataset_name = f"cashy-eval-v{version}"

    try:
        dataset = client.create_dataset(
            dataset_name=dataset_name,
            description=eval_data["metadata"]["description"],
        )
        logger.info("Created dataset: %s", dataset_name)
    except Exception:
        # create_dataset raises on a name conflict; fall back to the existing
        # dataset. Note: re-uploading appends duplicate examples to it.
        dataset = client.read_dataset(dataset_name=dataset_name)
        logger.info("Dataset already exists: %s", dataset_name)

    # Expected tools, args, and output snippets are stored as outputs so the
    # evaluators can score each run against them.
    for case in eval_data["cases"]:
        client.create_example(
            inputs={"input": case["input"]},
            outputs={
                "expected_tools": case.get("expected_tools", []),
                "expected_output_contains": case.get("expected_output_contains", []),
                "expected_tool_args": case.get("expected_tool_args", {}),
            },
            dataset_id=dataset.id,
            metadata={
                "category": case.get("category"),
                "case_id": case["id"],
                "criteria": case.get("evaluation_criteria", []),
            },
        )
    logger.info("Uploaded %d examples to dataset '%s'", len(eval_data["cases"]), dataset_name)


def main():
    parser = argparse.ArgumentParser(description="Run Cashy LangSmith evaluation")
    parser.add_argument("--dataset", default=DEFAULT_DATASET, help="LangSmith dataset name")
    parser.add_argument("--prefix", default=DEFAULT_PREFIX, help="Experiment prefix (for A/B naming)")
    parser.add_argument("--upload", action="store_true", help="Upload eval cases to LangSmith dataset")
    parser.add_argument("--file", default=str(EVAL_FILE), help="Local eval cases JSON file")
    args = parser.parse_args()

    if args.upload:
        eval_data = json.loads(Path(args.file).read_text())
        logger.info("Loaded %d eval cases from %s", len(eval_data["cases"]), args.file)
        upload_to_langsmith(eval_data)
        return

    logger.info("Creating agent...")
    agent = create_agent()
    target = make_target(agent)

    logger.info("Running experiment '%s' on dataset '%s'...", args.prefix, args.dataset)
    evaluate(
        target,
        data=args.dataset,
        evaluators=all_evaluators,
        experiment_prefix=args.prefix,
        max_concurrency=0,  # 0 disables concurrency; cases run sequentially
    )
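    # evaluate() blocks until all cases have run and scores are uploaded, so
    # the summary below prints only after the experiment completes.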

    print("\n" + "=" * 60)
    print(f"Experiment '{args.prefix}' complete.")
    print(f"View results in LangSmith: Datasets > {args.dataset} > Experiments")
    print("=" * 60)


if __name__ == "__main__":
    main()