Spaces:
Sleeping
Sleeping
File size: 2,461 Bytes
17a78b5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | """
Code-based evaluators for Cashy LangSmith experiments.
Each evaluator uses the new-style argument-name signature (outputs and,
where needed, reference_outputs) supported in langsmith 0.7.0. They receive:
- outputs: dict returned by the target function (response, tools_called, tool_args, error)
- reference_outputs: dict from the dataset example's outputs field
"""
def tool_usage(outputs: dict, reference_outputs: dict) -> dict:
    """Check if at least one expected tool was called.

    Args:
        outputs: Result of the target function; ``tools_called`` is read as
            the collection of tool names that were invoked.
        reference_outputs: Dataset example outputs; ``expected_tools`` is
            read as the acceptable tool names (a list, or a single name).

    Returns:
        ``{"key": "tool_usage", "score": s}`` where ``s`` is 1 when nothing
        is expected or at least one expected tool was called, otherwise 0.
    """
    expected = reference_outputs.get("expected_tools") or []
    # A key that is present but None must behave like a missing key instead
    # of raising TypeError on the membership test below.
    actual = outputs.get("tools_called") or []

    # A bare string would otherwise be iterated character by character
    # (expected) or matched by substring (actual); treat it as one tool name.
    if isinstance(expected, str):
        expected = [expected]
    if isinstance(actual, str):
        actual = [actual]

    if not expected:
        score = 1
    else:
        score = 1 if any(tool in actual for tool in expected) else 0
    return {"key": "tool_usage", "score": score}
def content_contains(outputs: dict, reference_outputs: dict) -> dict:
    """Check if all expected substrings appear in the response (case-insensitive).

    Args:
        outputs: Result of the target function; ``response`` is the text that
            is searched. A missing or falsy response is treated as "".
        reference_outputs: Dataset example outputs;
            ``expected_output_contains`` holds the required substrings
            (a list, or a single string).

    Returns:
        ``{"key": "content_contains", "score": s}`` where ``s`` is 1 when
        nothing is expected or every expected substring is found, else 0.
    """
    expected = reference_outputs.get("expected_output_contains") or []
    # A bare string must count as ONE substring; iterating it would test each
    # character separately and accept any response containing those letters.
    if isinstance(expected, str):
        expected = [expected]

    response = (outputs.get("response") or "").lower()

    # str() lets non-string expectations (e.g. numbers) be matched textually
    # instead of failing on .lower(). all() of an empty list is True, which
    # keeps the "nothing expected -> 1" rule.
    found_all = all(str(fragment).lower() in response for fragment in expected)
    return {"key": "content_contains", "score": 1 if found_all else 0}
def tool_args_match(outputs: dict, reference_outputs: dict) -> dict:
    """Check if tool calls contain the expected arguments.

    Compares each expected key-value pair against all actual tool call args.
    Values are compared as lowercased strings, so ``5`` matches ``"5"`` and
    ``"Checking"`` matches ``"checking"``. An actual value of ``None`` never
    counts as a match.

    Args:
        outputs: Result of the target function; ``tool_args`` is read as a
            list with one argument dict per tool call.
        reference_outputs: Dataset example outputs; ``expected_tool_args``
            maps argument names to their expected values.

    Returns:
        ``{"key": "tool_args_match", "score": s}`` where ``s`` is the
        fraction of expected pairs found in any tool call, or 1 when no
        arguments are expected.
    """
    expected_args = reference_outputs.get("expected_tool_args") or {}
    if not expected_args:
        return {"key": "tool_args_match", "score": 1}

    # Tolerate a None list and skip entries that are not dicts, so a
    # malformed or failed run scores low instead of raising.
    actual_args_list = [
        call_args
        for call_args in (outputs.get("tool_args") or [])
        if isinstance(call_args, dict)
    ]

    matched = 0
    for key, expected_val in expected_args.items():
        wanted = str(expected_val).lower()  # loop-invariant for the inner scan
        for call_args in actual_args_list:
            actual_val = call_args.get(key)
            if actual_val is not None and str(actual_val).lower() == wanted:
                matched += 1
                break  # count each expected pair at most once

    # expected_args is non-empty here, so the division is safe.
    return {"key": "tool_args_match", "score": matched / len(expected_args)}
def no_error(outputs: dict) -> dict:
    """Score 1 when ``outputs`` carries no truthy ``error`` value, else 0."""
    failed = bool(outputs.get("error"))
    return {"key": "no_error", "score": 0 if failed else 1}
# Every evaluator in this module, gathered in one list for easy import.
all_evaluators = [
    tool_usage,
    content_contains,
    tool_args_match,
    no_error,
]
|