Spaces:
Sleeping
Sleeping
| """ | |
| Code-based evaluators for Cashy LangSmith experiments. | |
| Each evaluator uses the new-style argument-name signature supported in | |
| langsmith 0.7.0: (outputs, reference_outputs), or just (outputs) for no_error. They receive: | |
| - outputs: dict returned by the target function (response, tools_called, tool_args, error) | |
| - reference_outputs: dict from the dataset example's outputs field | |
| """ | |
def tool_usage(outputs: dict, reference_outputs: dict) -> dict:
    """Check whether at least one expected tool was called.

    Args:
        outputs: Dict returned by the target function. Only ``tools_called``
            is read; a missing or ``None`` value means "no tools were called".
        reference_outputs: Dict from the dataset example. Only
            ``expected_tools`` is read; a missing, ``None`` or empty value
            means the example places no requirement on tool usage.

    Returns:
        ``{"key": "tool_usage", "score": 1}`` when nothing is expected or when
        any expected tool appears in ``tools_called``; score ``0`` otherwise.
    """
    # ``or []`` (rather than a ``.get`` default) also covers a key that is
    # present but explicitly ``None``, which would otherwise break ``in``.
    expected = reference_outputs.get("expected_tools") or []
    actual = outputs.get("tools_called") or []

    if not expected:
        score = 1
    else:
        score = 1 if any(tool in actual for tool in expected) else 0
    return {"key": "tool_usage", "score": score}
def content_contains(outputs: dict, reference_outputs: dict) -> dict:
    """Check that every expected substring appears in the response.

    The comparison is case-insensitive.

    Args:
        outputs: Dict returned by the target function. Only ``response`` is
            read; a missing or ``None`` response is treated as an empty string.
        reference_outputs: Dict from the dataset example. Only
            ``expected_output_contains`` is read. It is normally a list of
            substrings; a single bare string is treated as one substring, and
            non-string entries (e.g. numbers) are compared by their ``str()``.

    Returns:
        ``{"key": "content_contains", "score": 1}`` when nothing is expected
        or all expected substrings are found; score ``0`` otherwise.
    """
    expected = reference_outputs.get("expected_output_contains") or []
    if isinstance(expected, str):
        # Iterating a bare string would test each character separately and
        # report a match whenever the letters merely occur in the response.
        expected = [expected]

    response = (outputs.get("response") or "").lower()

    # ``all`` over an empty list is True, so "nothing expected" scores 1.
    found_all = all(str(part).lower() in response for part in expected)
    return {"key": "content_contains", "score": 1 if found_all else 0}
def tool_args_match(outputs: dict, reference_outputs: dict) -> dict:
    """Score how many expected tool arguments appear in the actual tool calls.

    Each expected key/value pair counts as matched when any recorded tool
    call has that key with a non-``None`` value whose lower-cased ``str()``
    equals the lower-cased ``str()`` of the expected value.

    Args:
        outputs: Dict returned by the target function. Only ``tool_args`` is
            read: a list with one argument dict per tool call. A missing or
            ``None`` list is treated as empty, and entries that are not dicts
            are ignored.
        reference_outputs: Dict from the dataset example. Only
            ``expected_tool_args`` (a dict of key -> expected value) is read.

    Returns:
        ``{"key": "tool_args_match", "score": s}`` where ``s`` is ``1`` when
        nothing is expected, otherwise the fraction of expected pairs found.
    """
    expected_args = reference_outputs.get("expected_tool_args") or {}
    if not expected_args:
        return {"key": "tool_args_match", "score": 1}

    # Drop null / malformed entries up front so one bad tool call cannot
    # crash the whole evaluation.
    calls = [
        args for args in (outputs.get("tool_args") or [])
        if isinstance(args, dict)
    ]

    matched = 0
    for key, expected_val in expected_args.items():
        wanted = str(expected_val).lower()  # normalised once per expected pair
        if any(
            args.get(key) is not None and str(args[key]).lower() == wanted
            for args in calls
        ):
            matched += 1

    # expected_args is non-empty here, so the division is always safe.
    return {"key": "tool_args_match", "score": matched / len(expected_args)}
def no_error(outputs: dict) -> dict:
    """Score 1 when the agent run reported no error, 0 otherwise.

    Any falsy ``error`` value in ``outputs`` (missing, ``None``, empty string)
    counts as "no error".
    """
    failed = bool(outputs.get("error"))
    return {"key": "no_error", "score": 0 if failed else 1}
# Every evaluator in this module, collected in one list for easy import
all_evaluators = [
    tool_usage,
    content_contains,
    tool_args_match,
    no_error,
]