File size: 2,461 Bytes
17a78b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""
Code-based evaluators for Cashy LangSmith experiments.

Each evaluator uses the new-style signature (outputs, reference_outputs)
supported in langsmith 0.7.0. They receive:
  - outputs: dict returned by the target function (response, tools_called, tool_args, error)
  - reference_outputs: dict from the dataset example's outputs field
"""


def tool_usage(outputs: dict, reference_outputs: dict) -> dict:
    """Score 1 if at least one expected tool was called, else 0.

    An empty expectation list is vacuously satisfied and scores 1.

    Args:
        outputs: target-function result; reads ``tools_called`` (list of tool names).
        reference_outputs: dataset example outputs; reads ``expected_tools``.

    Returns:
        ``{"key": "tool_usage", "score": 0 or 1}``.
    """
    wanted = reference_outputs.get("expected_tools", [])
    called = outputs.get("tools_called", [])

    hit = not wanted or any(tool in called for tool in wanted)
    return {"key": "tool_usage", "score": 1 if hit else 0}


def content_contains(outputs: dict, reference_outputs: dict) -> dict:
    """Score 1 if every expected substring occurs in the response (case-insensitive).

    A missing/None response is treated as an empty string; an empty
    expectation list scores 1.

    Args:
        outputs: target-function result; reads ``response`` (str or None).
        reference_outputs: dataset example outputs; reads ``expected_output_contains``.

    Returns:
        ``{"key": "content_contains", "score": 0 or 1}``.
    """
    needles = reference_outputs.get("expected_output_contains", [])
    haystack = (outputs.get("response") or "").lower()

    # Collect what is missing; the score is 1 only when nothing is.
    missing = [needle for needle in needles if needle.lower() not in haystack]
    return {"key": "content_contains", "score": 0 if missing else 1}


def tool_args_match(outputs: dict, reference_outputs: dict) -> dict:
    """Score the fraction of expected tool-arg pairs found in any tool call.

    Each expected ``key: value`` pair counts as matched when some actual
    tool call supplies that key with an equal value, compared as
    lowercased strings. No expectations means a perfect score of 1.

    Args:
        outputs: target-function result; reads ``tool_args`` (list of dicts).
        reference_outputs: dataset example outputs; reads ``expected_tool_args``.

    Returns:
        ``{"key": "tool_args_match", "score": float in [0, 1]}``.
    """
    expected = reference_outputs.get("expected_tool_args", {})
    if not expected:
        return {"key": "tool_args_match", "score": 1}

    calls = outputs.get("tool_args", [])

    def _pair_found(key, want):
        # True when any call carries `key` with a case-insensitively equal value.
        for call in calls:
            got = call.get(key)
            if got is not None and str(got).lower() == str(want).lower():
                return True
        return False

    hits = sum(1 for key, want in expected.items() if _pair_found(key, want))
    return {"key": "tool_args_match", "score": hits / len(expected)}


def no_error(outputs: dict) -> dict:
    """Score 1 when the agent run produced no error, 0 otherwise.

    Any truthy ``error`` value (e.g. a non-empty message) counts as a failure.

    Args:
        outputs: target-function result; reads ``error``.

    Returns:
        ``{"key": "no_error", "score": 0 or 1}``.
    """
    return {"key": "no_error", "score": 0 if outputs.get("error") else 1}


# Convenience aggregate of every evaluator above, e.g. for passing as the
# `evaluators=` argument in one go instead of listing them individually.
all_evaluators = [tool_usage, content_contains, tool_args_match, no_error]