""" Helper functions for creating evaluation results in the standardized format. """ from typing import List, Optional from custom_types import ( Utterance, EvaluationResult, UtteranceScore, SegmentScore, CategoricalScore, NumericalScore, MetricScore ) def create_categorical_score( label: str, confidence: Optional[float] = None ) -> CategoricalScore: """ Create a categorical score. Args: label: Category label (e.g., "High", "Change", "Positive") confidence: Optional confidence score 0-1 Returns: CategoricalScore """ return { "type": "categorical", "label": label, "confidence": confidence } def create_numerical_score( value: float, max_value: float, label: Optional[str] = None ) -> NumericalScore: """ Create a numerical score. Args: value: The score value max_value: Maximum possible score label: Optional derived label (e.g., "High" if value > threshold) Returns: NumericalScore """ return { "type": "numerical", "value": value, "max_value": max_value, "label": label } def create_utterance_result( conversation: List[Utterance], scores_per_utterance: List[dict[str, MetricScore]] ) -> EvaluationResult: """ Create an utterance-level evaluation result. Args: conversation: The full conversation scores_per_utterance: List of metric scores, one dict per utterance Returns: EvaluationResult with granularity="utterance" """ per_utterance: List[UtteranceScore] = [] for i, scores in enumerate(scores_per_utterance): per_utterance.append({ "index": i, "metrics": scores }) return { "granularity": "utterance", "overall": None, "per_utterance": per_utterance, "per_segment": None } def create_conversation_result( overall_scores: dict[str, MetricScore] ) -> EvaluationResult: """ Create a conversation-level evaluation result. Args: overall_scores: Aggregate scores for the entire conversation Returns: EvaluationResult with granularity="conversation" """ return { "granularity": "conversation", "overall": overall_scores, "per_utterance": None, "per_segment": None } def create_segment_result( segments: List[tuple[List[int], dict[str, MetricScore]]] ) -> EvaluationResult: """ Create a segment-level evaluation result. Args: segments: List of (utterance_indices, scores) tuples Returns: EvaluationResult with granularity="segment" """ per_segment: List[SegmentScore] = [] for utterance_indices, scores in segments: per_segment.append({ "utterance_indices": utterance_indices, "metrics": scores }) return { "granularity": "segment", "overall": None, "per_utterance": None, "per_segment": per_segment } def derive_label_from_score(value: float, max_value: float, thresholds: Optional[dict] = None) -> str: """ Derive a categorical label from a numerical score. Args: value: The score value max_value: Maximum possible score thresholds: Optional custom thresholds. Default: {"Low": 0.33, "Medium": 0.66, "High": 1.0} Returns: Label string """ if thresholds is None: thresholds = {"Low": 0.33, "Medium": 0.66, "High": 1.0} ratio = value / max_value for label, threshold in sorted(thresholds.items(), key=lambda x: x[1]): if ratio <= threshold: return label return list(thresholds.keys())[-1]