"""
Helper functions for creating evaluation results in the standardized format.
"""
from typing import List, Optional
from custom_types import (
Utterance, EvaluationResult, UtteranceScore, SegmentScore,
CategoricalScore, NumericalScore, MetricScore
)


def create_categorical_score(
    label: str,
    confidence: Optional[float] = None
) -> CategoricalScore:
    """
    Create a categorical score.

    Args:
        label: Category label (e.g., "High", "Change", "Positive")
        confidence: Optional confidence score 0-1

    Returns:
CategoricalScore
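
    Example:
        >>> create_categorical_score("High", confidence=0.9)
        {'type': 'categorical', 'label': 'High', 'confidence': 0.9}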
"""
return {
"type": "categorical",
"label": label,
"confidence": confidence
    }


def create_numerical_score(
    value: float,
    max_value: float,
    label: Optional[str] = None
) -> NumericalScore:
    """
    Create a numerical score.

    Args:
        value: The score value
        max_value: Maximum possible score
        label: Optional derived label (e.g., "High" if value > threshold)

    Returns:
NumericalScore
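
    Example:
        >>> create_numerical_score(4.0, 5.0, label="High")
        {'type': 'numerical', 'value': 4.0, 'max_value': 5.0, 'label': 'High'}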
"""
return {
"type": "numerical",
"value": value,
"max_value": max_value,
"label": label
    }


def create_utterance_result(
    conversation: List[Utterance],
    scores_per_utterance: List[dict[str, MetricScore]]
) -> EvaluationResult:
    """
    Create an utterance-level evaluation result.

    Args:
        conversation: The full conversation
        scores_per_utterance: List of metric scores, one dict per utterance

    Returns:
EvaluationResult with granularity="utterance"
"""
    if len(scores_per_utterance) != len(conversation):
        raise ValueError(
            f"Expected {len(conversation)} score dicts (one per utterance), "
            f"got {len(scores_per_utterance)}"
        )
    per_utterance: List[UtteranceScore] = []
    for i, scores in enumerate(scores_per_utterance):
per_utterance.append({
"index": i,
"metrics": scores
})
return {
"granularity": "utterance",
"overall": None,
"per_utterance": per_utterance,
"per_segment": None
    }


def create_conversation_result(
    overall_scores: dict[str, MetricScore]
) -> EvaluationResult:
    """
    Create a conversation-level evaluation result.

    Args:
        overall_scores: Aggregate scores for the entire conversation

    Returns:
EvaluationResult with granularity="conversation"
"""
return {
"granularity": "conversation",
"overall": overall_scores,
"per_utterance": None,
"per_segment": None
    }


def create_segment_result(
    segments: List[tuple[List[int], dict[str, MetricScore]]]
) -> EvaluationResult:
    """
    Create a segment-level evaluation result.

    Args:
        segments: List of (utterance_indices, scores) tuples

    Returns:
EvaluationResult with granularity="segment"
"""
per_segment: List[SegmentScore] = []
for utterance_indices, scores in segments:
per_segment.append({
"utterance_indices": utterance_indices,
"metrics": scores
})
return {
"granularity": "segment",
"overall": None,
"per_utterance": None,
"per_segment": per_segment
    }


def derive_label_from_score(
    value: float,
    max_value: float,
    thresholds: Optional[dict[str, float]] = None
) -> str:
    """
    Derive a categorical label from a numerical score.

    Args:
        value: The score value
        max_value: Maximum possible score (must be non-zero)
        thresholds: Optional mapping from label to the inclusive upper bound
            on value / max_value. Default: {"Low": 0.33, "Medium": 0.66, "High": 1.0}

    Returns:
Label string
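
    Example:
        >>> derive_label_from_score(2.0, 10.0)
        'Low'
        >>> derive_label_from_score(5.0, 10.0)
        'Medium'
        >>> derive_label_from_score(9.0, 10.0)
        'High'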
"""
    if thresholds is None:
        thresholds = {"Low": 0.33, "Medium": 0.66, "High": 1.0}
    ratio = value / max_value
    # Check buckets from the lowest threshold upward; the first threshold the
    # ratio does not exceed wins.
    ordered = sorted(thresholds.items(), key=lambda x: x[1])
    for label, threshold in ordered:
        if ratio <= threshold:
            return label
    # The ratio exceeded every threshold (e.g., value > max_value); fall back
    # to the label with the highest threshold, not the last-inserted key.
    return ordered[-1][0]
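

if __name__ == "__main__":
    # Lightweight sanity check: run the usage examples embedded in the
    # docstrings above as doctests.
    import doctest
    doctest.testmod()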