from typing import TypedDict, List, Optional, Literal, Union


class Utterance(TypedDict):
    speaker: str
    text: str


# Score types (mutually exclusive)
class CategoricalScore(TypedDict):
    """Categorical evaluation: label only"""
    type: Literal["categorical"]
    label: str  # e.g., "High", "Change", "Positive"
    confidence: Optional[float]  # Optional: 0-1 confidence if available


class NumericalScore(TypedDict):
    """Numerical evaluation: score with max value"""
    type: Literal["numerical"]
    value: float  # e.g., 3.0, 0.85, 8.5
    max_value: float  # e.g., 5.0, 1.0, 10.0
    label: Optional[str]  # Optional: derived label like "High" if value > threshold


# Union type for metric scores
MetricScore = Union[CategoricalScore, NumericalScore]


# Evaluation result structures
class UtteranceScore(TypedDict):
    """Per-utterance evaluation result"""
    index: int  # Index in original conversation
    metrics: dict[str, MetricScore]  # e.g., {"talk_type": {...}, "empathy_er": {...}}


class SegmentScore(TypedDict):
    """Multi-utterance segment evaluation result"""
    utterance_indices: List[int]  # Which utterances this segment covers
    metrics: dict[str, MetricScore]  # Aggregate metrics for this segment


class EvaluationResult(TypedDict):
    """
    Unified evaluation result format.

    Based on granularity, only one of overall/per_utterance/per_segment
    will be populated:
    - granularity="utterance": per_utterance has data
    - granularity="segment": per_segment has data
    - granularity="conversation": overall has data
    """
    granularity: Literal["utterance", "segment", "conversation"]
    overall: Optional[dict[str, MetricScore]]  # Conversation-level scores
    per_utterance: Optional[List[UtteranceScore]]  # Per-utterance scores
    per_segment: Optional[List[SegmentScore]]  # Per-segment scores
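

# --- Example usage ---------------------------------------------------------
# Illustrative sketch only: the speaker roles and sample values below are
# hypothetical placeholders, not part of the schema. The metric keys
# ("talk_type", "empathy_er") come from the comment on UtteranceScore above.
if __name__ == "__main__":
    conversation: List[Utterance] = [
        {"speaker": "therapist", "text": "How have you been feeling this week?"},
        {"speaker": "client", "text": "Honestly, a bit better than last time."},
    ]

    # One categorical and one numerical score, matching the MetricScore union
    talk_type: CategoricalScore = {
        "type": "categorical",
        "label": "Change",
        "confidence": 0.92,
    }
    empathy: NumericalScore = {
        "type": "numerical",
        "value": 4.0,
        "max_value": 5.0,
        "label": "High",
    }

    # Utterance-level granularity: only per_utterance is populated,
    # the other two fields stay None per the EvaluationResult docstring.
    result: EvaluationResult = {
        "granularity": "utterance",
        "overall": None,
        "per_utterance": [
            {"index": 1, "metrics": {"talk_type": talk_type, "empathy_er": empathy}},
        ],
        "per_segment": None,
    }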