Spaces:

SeasonalFall84
/

Cashy

Sleeping

GitHub Actions

Deploy to HF Spaces

17a78b5 12 days ago

2.46 kB

	"""
	Code-based evaluators for Cashy LangSmith experiments.

	Each evaluator uses the new-style signature supported in langsmith 0.7.0 and
	declares only the arguments it needs: (outputs, reference_outputs), or just
	(outputs) for no_error. They receive:
	- outputs: dict returned by the target function (response, tools_called, tool_args, error)
	- reference_outputs: dict from the dataset example's outputs field
	"""


	def tool_usage(outputs: dict, reference_outputs: dict) -> dict:
	    """Check whether at least one expected tool was called.

	    Args:
	        outputs: Result of the target function. Its ``tools_called`` entry is
	            read; a missing or ``None`` value is treated as "no tools called".
	        reference_outputs: Outputs of the dataset example. Its
	            ``expected_tools`` entry holds the acceptable tool names.

	    Returns:
	        ``{"key": "tool_usage", "score": s}`` where ``s`` is 1 when nothing is
	        expected or any expected tool is found in ``tools_called``, else 0.
	    """
	    # `or []` (instead of a .get default) also covers keys that are present but
	    # explicitly None, which would otherwise make the `in` test raise TypeError.
	    expected = reference_outputs.get("expected_tools") or []
	    called = outputs.get("tools_called") or []

	    hit = not expected or any(tool in called for tool in expected)
	    return {"key": "tool_usage", "score": 1 if hit else 0}


	def content_contains(outputs: dict, reference_outputs: dict) -> dict:
	    """Check that every expected substring appears in the response.

	    Matching is case-insensitive. Expected items and the response are passed
	    through ``str()`` before comparison, so non-string values (for example a
	    numeric amount in the dataset) are matched by their text form instead of
	    raising ``AttributeError``.

	    Args:
	        outputs: Result of the target function. Its ``response`` entry is
	            searched; a missing or falsy value is treated as an empty string.
	        reference_outputs: Outputs of the dataset example. Its
	            ``expected_output_contains`` entry holds the required substrings.

	    Returns:
	        ``{"key": "content_contains", "score": s}`` where ``s`` is 1 when
	        nothing is expected or all expected substrings are found, else 0.
	    """
	    expected = reference_outputs.get("expected_output_contains") or []
	    response = str(outputs.get("response") or "").lower()

	    # all() over an empty iterable is True, which covers the "nothing expected" case.
	    found_all = all(str(fragment).lower() in response for fragment in expected)
	    return {"key": "content_contains", "score": 1 if found_all else 0}


	def tool_args_match(outputs: dict, reference_outputs: dict) -> dict:
	    """Score how many expected tool arguments appear in the actual tool calls.

	    Each expected key/value pair is looked up in every recorded tool call. A
	    pair counts as matched when any call holds a non-``None`` value for that
	    key whose ``str()`` form equals the expected value's, ignoring case.

	    Args:
	        outputs: Result of the target function. Its ``tool_args`` entry is
	            read as a list of per-call argument dicts; a missing or ``None``
	            value is treated as no calls, and non-dict entries are skipped.
	        reference_outputs: Outputs of the dataset example. Its
	            ``expected_tool_args`` entry maps argument names to expected values.

	    Returns:
	        ``{"key": "tool_args_match", "score": s}`` where ``s`` is 1 when
	        nothing is expected, otherwise the fraction (0.0 to 1.0) of expected
	        pairs found in any tool call.
	    """
	    expected_args = reference_outputs.get("expected_tool_args") or {}
	    if not expected_args:
	        return {"key": "tool_args_match", "score": 1}

	    # Keep only dict entries: a None or otherwise malformed entry has no
	    # `.get` and would crash the whole evaluator instead of simply not matching.
	    calls = [call for call in (outputs.get("tool_args") or []) if isinstance(call, dict)]

	    def _found(key, expected_val) -> bool:
	        """Return True if any call carries `key` with a matching value."""
	        wanted = str(expected_val).lower()
	        for call in calls:
	            actual = call.get(key)
	            if actual is not None and str(actual).lower() == wanted:
	                return True
	        return False

	    matched = sum(_found(key, val) for key, val in expected_args.items())
	    # expected_args is non-empty here, so the division is always safe.
	    return {"key": "tool_args_match", "score": matched / len(expected_args)}


	def no_error(outputs: dict) -> dict:
	    """Report whether the agent run finished without an error.

	    Args:
	        outputs: Result of the target function. Its ``error`` entry is read.

	    Returns:
	        ``{"key": "no_error", "score": s}`` where ``s`` is 0 when ``error`` is
	        truthy and 1 when it is missing or falsy.
	    """
	    failed = bool(outputs.get("error"))
	    return {"key": "no_error", "score": 0 if failed else 1}


	# Every evaluator defined in this module, collected in one list so callers
	# can import a single name.
	all_evaluators = [
	    tool_usage,
	    content_contains,
	    tool_args_match,
	    no_error,
	]