| """ | |
| Helper functions for creating evaluation results in the standardized format. | |
| """ | |
| from typing import List, Optional | |
| from custom_types import ( | |
| Utterance, EvaluationResult, UtteranceScore, SegmentScore, | |
| CategoricalScore, NumericalScore, MetricScore | |
| ) | |

def create_categorical_score(
    label: str,
    confidence: Optional[float] = None
) -> CategoricalScore:
    """
    Create a categorical score.

    Args:
        label: Category label (e.g., "High", "Change", "Positive")
        confidence: Optional confidence score in [0, 1]

    Returns:
        CategoricalScore
    """
    return {
        "type": "categorical",
        "label": label,
        "confidence": confidence
    }
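
# Usage sketch (illustrative values; assumes CategoricalScore is a mapping
# whose keys match the dict built above):
#
#     score = create_categorical_score("Positive", confidence=0.92)
#     # -> {"type": "categorical", "label": "Positive", "confidence": 0.92}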

def create_numerical_score(
    value: float,
    max_value: float,
    label: Optional[str] = None
) -> NumericalScore:
    """
    Create a numerical score.

    Args:
        value: The score value
        max_value: Maximum possible score
        label: Optional derived label (e.g., "High" if value > threshold)

    Returns:
        NumericalScore
    """
    return {
        "type": "numerical",
        "value": value,
        "max_value": max_value,
        "label": label
    }
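
# Usage sketch (illustrative values; pairs with derive_label_from_score
# below to fill the optional label):
#
#     score = create_numerical_score(7.0, max_value=10.0,
#                                    label=derive_label_from_score(7.0, 10.0))
#     # -> {"type": "numerical", "value": 7.0, "max_value": 10.0, "label": "High"}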

def create_utterance_result(
    conversation: list[Utterance],
    scores_per_utterance: list[dict[str, MetricScore]]
) -> EvaluationResult:
    """
    Create an utterance-level evaluation result.

    Args:
        conversation: The full conversation
        scores_per_utterance: List of metric scores, one dict per utterance

    Returns:
        EvaluationResult with granularity="utterance"

    Raises:
        ValueError: If the number of score dicts does not match the number
            of utterances in the conversation.
    """
    if len(scores_per_utterance) != len(conversation):
        raise ValueError(
            f"Expected one score dict per utterance ({len(conversation)}), "
            f"got {len(scores_per_utterance)}"
        )
    per_utterance: list[UtteranceScore] = [
        {"index": i, "metrics": scores}
        for i, scores in enumerate(scores_per_utterance)
    ]
    return {
        "granularity": "utterance",
        "overall": None,
        "per_utterance": per_utterance,
        "per_segment": None
    }
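
# Usage sketch. The two-field Utterance shape here is an assumption for
# illustration; use whatever fields custom_types.Utterance actually defines:
#
#     conversation = [
#         {"speaker": "A", "text": "How have you been?"},
#         {"speaker": "B", "text": "Better than last week."},
#     ]
#     result = create_utterance_result(conversation, [
#         {"sentiment": create_categorical_score("Neutral")},
#         {"sentiment": create_categorical_score("Positive", confidence=0.8)},
#     ])
#     # result["per_utterance"][1]["metrics"]["sentiment"]["label"] == "Positive"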

def create_conversation_result(
    overall_scores: dict[str, MetricScore]
) -> EvaluationResult:
    """
    Create a conversation-level evaluation result.

    Args:
        overall_scores: Aggregate scores for the entire conversation

    Returns:
        EvaluationResult with granularity="conversation"
    """
    return {
        "granularity": "conversation",
        "overall": overall_scores,
        "per_utterance": None,
        "per_segment": None
    }
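
# Usage sketch (metric names are illustrative):
#
#     result = create_conversation_result({
#         "quality": create_numerical_score(8.0, max_value=10.0, label="High"),
#         "outcome": create_categorical_score("Change"),
#     })
#     # result["granularity"] == "conversation"; the per-utterance and
#     # per-segment fields are None at this granularity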

def create_segment_result(
    segments: list[tuple[list[int], dict[str, MetricScore]]]
) -> EvaluationResult:
    """
    Create a segment-level evaluation result.

    Args:
        segments: List of (utterance_indices, scores) tuples

    Returns:
        EvaluationResult with granularity="segment"
    """
    per_segment: list[SegmentScore] = [
        {"utterance_indices": utterance_indices, "metrics": scores}
        for utterance_indices, scores in segments
    ]
    return {
        "granularity": "segment",
        "overall": None,
        "per_utterance": None,
        "per_segment": per_segment
    }
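
# Usage sketch (segment boundaries and metric names are illustrative):
#
#     result = create_segment_result([
#         ([0, 1, 2], {"coherence": create_categorical_score("High")}),
#         ([3, 4], {"coherence": create_categorical_score("Medium")}),
#     ])
#     # result["per_segment"][0]["utterance_indices"] == [0, 1, 2]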

def derive_label_from_score(
    value: float,
    max_value: float,
    thresholds: Optional[dict[str, float]] = None
) -> str:
    """
    Derive a categorical label from a numerical score.

    Args:
        value: The score value
        max_value: Maximum possible score (must be positive)
        thresholds: Optional custom thresholds mapping each label to the
            upper bound of its value/max_value ratio.
            Default: {"Low": 0.33, "Medium": 0.66, "High": 1.0}

    Returns:
        Label string
    """
    if max_value <= 0:
        raise ValueError("max_value must be positive")
    if thresholds is None:
        thresholds = {"Low": 0.33, "Medium": 0.66, "High": 1.0}
    ratio = value / max_value
    # Walk the bands from lowest to highest threshold and return the first
    # one the ratio falls into.
    ordered = sorted(thresholds.items(), key=lambda x: x[1])
    for label, threshold in ordered:
        if ratio <= threshold:
            return label
    # Ratios above every threshold (e.g., floating-point drift past 1.0)
    # fall back to the highest band rather than the dict's last-inserted key.
    return ordered[-1][0]
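
# Usage sketch:
#
#     derive_label_from_score(2.0, 10.0)   # "Low"    (0.20 <= 0.33)
#     derive_label_from_score(5.0, 10.0)   # "Medium" (0.50 <= 0.66)
#     derive_label_from_score(9.0, 10.0)   # "High"
#     derive_label_from_score(3.0, 4.0, {"Fail": 0.5, "Pass": 1.0})  # "Pass"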