# hhh/utils/evaluation_helpers.py
"""
Helper functions for creating evaluation results in the standardized format.
"""

from typing import Dict, List, Optional, Tuple

from custom_types import (
    Utterance, EvaluationResult, UtteranceScore, SegmentScore,
    CategoricalScore, NumericalScore, MetricScore
)


def create_categorical_score(
    label: str,
    confidence: Optional[float] = None
) -> CategoricalScore:
    """
    Create a categorical score.

    Args:
        label: Category label (e.g., "High", "Change", "Positive")
        confidence: Optional confidence score 0-1

    Returns:
        CategoricalScore
    """
    return {
        "type": "categorical",
        "label": label,
        "confidence": confidence
    }
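
# Illustrative call: the helper just assembles the dict shown below, so the
# label and confidence here are arbitrary examples.
#     create_categorical_score("Positive", confidence=0.92)
#     -> {"type": "categorical", "label": "Positive", "confidence": 0.92}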


def create_numerical_score(
    value: float,
    max_value: float,
    label: Optional[str] = None
) -> NumericalScore:
    """
    Create a numerical score.

    Args:
        value: The score value
        max_value: Maximum possible score
        label: Optional derived label (e.g., "High" if value > threshold)

    Returns:
        NumericalScore
    """
    return {
        "type": "numerical",
        "value": value,
        "max_value": max_value,
        "label": label
    }
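
# Illustrative call; the values are arbitrary and the label would typically
# come from derive_label_from_score below.
#     create_numerical_score(4.0, 5.0, label="High")
#     -> {"type": "numerical", "value": 4.0, "max_value": 5.0, "label": "High"}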


def create_utterance_result(
    conversation: List[Utterance],
    scores_per_utterance: List[Dict[str, MetricScore]]
) -> EvaluationResult:
    """
    Create an utterance-level evaluation result.

    Args:
        conversation: The full conversation
        scores_per_utterance: List of metric scores, one dict per utterance

    Returns:
        EvaluationResult with granularity="utterance"
    """
    # Guard against mismatched inputs: every utterance needs exactly one
    # dict of metric scores.
    if len(scores_per_utterance) != len(conversation):
        raise ValueError(
            f"Expected one score dict per utterance, got "
            f"{len(scores_per_utterance)} for {len(conversation)} utterances"
        )
    per_utterance: List[UtteranceScore] = []
    for i, scores in enumerate(scores_per_utterance):
        per_utterance.append({
            "index": i,
            "metrics": scores
        })
    return {
        "granularity": "utterance",
        "overall": None,
        "per_utterance": per_utterance,
        "per_segment": None
    }
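
# Illustrative call ("empathy" is a hypothetical metric key; any mapping of
# metric names to MetricScore values works):
#     create_utterance_result(
#         conversation,
#         [{"empathy": create_categorical_score("High")} for _ in conversation]
#     )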


def create_conversation_result(
    overall_scores: Dict[str, MetricScore]
) -> EvaluationResult:
    """
    Create a conversation-level evaluation result.

    Args:
        overall_scores: Aggregate scores for the entire conversation

    Returns:
        EvaluationResult with granularity="conversation"
    """
    return {
        "granularity": "conversation",
        "overall": overall_scores,
        "per_utterance": None,
        "per_segment": None
    }
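
# Illustrative call ("overall_quality" is a hypothetical metric key):
#     create_conversation_result(
#         {"overall_quality": create_numerical_score(7.0, 10.0, label="High")}
#     )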


def create_segment_result(
    segments: List[Tuple[List[int], Dict[str, MetricScore]]]
) -> EvaluationResult:
    """
    Create a segment-level evaluation result.

    Args:
        segments: List of (utterance_indices, scores) tuples

    Returns:
        EvaluationResult with granularity="segment"
    """
    per_segment: List[SegmentScore] = []
    for utterance_indices, scores in segments:
        per_segment.append({
            "utterance_indices": utterance_indices,
            "metrics": scores
        })
    return {
        "granularity": "segment",
        "overall": None,
        "per_utterance": None,
        "per_segment": per_segment
    }
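
# Illustrative call: two segments covering utterances 0-1 and 2-3, scored on
# a hypothetical "coherence" metric:
#     create_segment_result([
#         ([0, 1], {"coherence": create_categorical_score("High")}),
#         ([2, 3], {"coherence": create_categorical_score("Low")})
#     ])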


def derive_label_from_score(
    value: float,
    max_value: float,
    thresholds: Optional[Dict[str, float]] = None
) -> str:
    """
    Derive a categorical label from a numerical score.

    Args:
        value: The score value
        max_value: Maximum possible score (must be positive)
        thresholds: Optional custom thresholds mapping each label to the
            upper bound of its value/max_value band.
            Default: {"Low": 0.33, "Medium": 0.66, "High": 1.0}

    Returns:
        Label string
    """
    if thresholds is None:
        thresholds = {"Low": 0.33, "Medium": 0.66, "High": 1.0}
    if max_value <= 0:
        raise ValueError("max_value must be positive")
    ratio = value / max_value
    # Check bands from the lowest threshold upward and return the first
    # label whose upper bound the ratio does not exceed.
    for label, threshold in sorted(thresholds.items(), key=lambda x: x[1]):
        if ratio <= threshold:
            return label
    # Ratios above every threshold (e.g., value > max_value) fall into the
    # highest band; select it by threshold value rather than dict order.
    return max(thresholds, key=thresholds.get)
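

if __name__ == "__main__":
    # Minimal smoke test wiring the helpers together. The "quality" metric
    # name and the score values are made up for demonstration only.
    label = derive_label_from_score(2.0, 3.0)  # ratio ~0.67 -> "High"
    result = create_conversation_result(
        {"quality": create_numerical_score(2.0, 3.0, label=label)}
    )
    print(result)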