# hhh/utils/evaluation_helpers.py
"""
Helper functions for creating evaluation results in the standardized format.
"""

from typing import Dict, List, Optional, Tuple

from custom_types import (
    Utterance, EvaluationResult, UtteranceScore, SegmentScore,
    CategoricalScore, NumericalScore, MetricScore
)


def create_categorical_score(
    label: str,
    confidence: Optional[float] = None
) -> CategoricalScore:
    """
    Create a categorical score.

    Args:
        label: Category label (e.g., "High", "Change", "Positive")
        confidence: Optional confidence score 0-1

    Returns:
        CategoricalScore
    """
    return {
        "type": "categorical",
        "label": label,
        "confidence": confidence
    }
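
# Illustrative call: the helper just assembles the dict shown below, so the
# label and confidence here are arbitrary examples.
#     create_categorical_score("Positive", confidence=0.92)
#     -> {"type": "categorical", "label": "Positive", "confidence": 0.92}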


def create_numerical_score(
    value: float,
    max_value: float,
    label: Optional[str] = None
) -> NumericalScore:
    """
    Create a numerical score.

    Args:
        value: The score value
        max_value: Maximum possible score
        label: Optional derived label (e.g., "High" if value > threshold)

    Returns:
        NumericalScore
    """
    return {
        "type": "numerical",
        "value": value,
        "max_value": max_value,
        "label": label
    }
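
# Illustrative call; the values are arbitrary and the label would typically
# come from derive_label_from_score below.
#     create_numerical_score(4.0, 5.0, label="High")
#     -> {"type": "numerical", "value": 4.0, "max_value": 5.0, "label": "High"}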


def create_utterance_result(
    conversation: List[Utterance],
    scores_per_utterance: List[Dict[str, MetricScore]]
) -> EvaluationResult:
    """
    Create an utterance-level evaluation result.

    Args:
        conversation: The full conversation
        scores_per_utterance: List of metric scores, one dict per utterance

    Returns:
        EvaluationResult with granularity="utterance"
    """
    # Guard against mismatched inputs: every utterance needs exactly one
    # dict of metric scores.
    if len(scores_per_utterance) != len(conversation):
        raise ValueError(
            f"Expected one score dict per utterance, got "
            f"{len(scores_per_utterance)} for {len(conversation)} utterances"
        )
    per_utterance: List[UtteranceScore] = []
    for i, scores in enumerate(scores_per_utterance):
        per_utterance.append({
            "index": i,
            "metrics": scores
        })
    return {
        "granularity": "utterance",
        "overall": None,
        "per_utterance": per_utterance,
        "per_segment": None
    }
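
# Illustrative call ("empathy" is a hypothetical metric key; any mapping of
# metric names to MetricScore values works):
#     create_utterance_result(
#         conversation,
#         [{"empathy": create_categorical_score("High")} for _ in conversation]
#     )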


def create_conversation_result(
    overall_scores: Dict[str, MetricScore]
) -> EvaluationResult:
    """
    Create a conversation-level evaluation result.

    Args:
        overall_scores: Aggregate scores for the entire conversation

    Returns:
        EvaluationResult with granularity="conversation"
    """
    return {
        "granularity": "conversation",
        "overall": overall_scores,
        "per_utterance": None,
        "per_segment": None
    }
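
# Illustrative call ("overall_quality" is a hypothetical metric key):
#     create_conversation_result(
#         {"overall_quality": create_numerical_score(7.0, 10.0, label="High")}
#     )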


def create_segment_result(
    segments: List[Tuple[List[int], Dict[str, MetricScore]]]
) -> EvaluationResult:
    """
    Create a segment-level evaluation result.

    Args:
        segments: List of (utterance_indices, scores) tuples

    Returns:
        EvaluationResult with granularity="segment"
    """
    per_segment: List[SegmentScore] = []
    for utterance_indices, scores in segments:
        per_segment.append({
            "utterance_indices": utterance_indices,
            "metrics": scores
        })
    return {
        "granularity": "segment",
        "overall": None,
        "per_utterance": None,
        "per_segment": per_segment
    }
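
# Illustrative call: two segments covering utterances 0-1 and 2-3, scored on
# a hypothetical "coherence" metric:
#     create_segment_result([
#         ([0, 1], {"coherence": create_categorical_score("High")}),
#         ([2, 3], {"coherence": create_categorical_score("Low")})
#     ])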


def derive_label_from_score(
    value: float,
    max_value: float,
    thresholds: Optional[Dict[str, float]] = None
) -> str:
    """
    Derive a categorical label from a numerical score.

    Args:
        value: The score value
        max_value: Maximum possible score (must be positive)
        thresholds: Optional custom thresholds mapping each label to the
            upper bound of its value/max_value band.
            Default: {"Low": 0.33, "Medium": 0.66, "High": 1.0}

    Returns:
        Label string
    """
    if thresholds is None:
        thresholds = {"Low": 0.33, "Medium": 0.66, "High": 1.0}
    if max_value <= 0:
        raise ValueError("max_value must be positive")
    ratio = value / max_value
    # Check bands from the lowest threshold upward and return the first
    # label whose upper bound the ratio does not exceed.
    for label, threshold in sorted(thresholds.items(), key=lambda x: x[1]):
        if ratio <= threshold:
            return label
    # Ratios above every threshold (e.g., value > max_value) fall into the
    # highest band; select it by threshold value rather than dict order.
    return max(thresholds, key=thresholds.get)
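

if __name__ == "__main__":
    # Minimal smoke test wiring the helpers together. The "quality" metric
    # name and the score values are made up for demonstration only.
    label = derive_label_from_score(2.0, 3.0)  # ratio ~0.67 -> "High"
    result = create_conversation_result(
        {"quality": create_numerical_score(2.0, 3.0, label=label)}
    )
    print(result)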