from typing import TypedDict, List, Optional, Literal, Union


class Utterance(TypedDict):
    speaker: str
    text: str


# Score types (mutually exclusive)
class CategoricalScore(TypedDict):
    """Categorical evaluation: label only"""
    type: Literal["categorical"]
    label: str  # e.g., "High", "Change", "Positive"
    confidence: Optional[float]  # Optional: 0-1 confidence if available


class NumericalScore(TypedDict):
    """Numerical evaluation: score with max value"""
    type: Literal["numerical"]
    value: float  # e.g., 3.0, 0.85, 8.5
    max_value: float  # e.g., 5.0, 1.0, 10.0
    label: Optional[str]  # Optional: derived label like "High" if value > threshold


# Union type for metric scores
MetricScore = Union[CategoricalScore, NumericalScore]


# Evaluation result structures
class UtteranceScore(TypedDict):
    """Per-utterance evaluation result"""
    index: int  # Index in original conversation
    metrics: dict[str, MetricScore]  # e.g., {"talk_type": {...}, "empathy_er": {...}}


class SegmentScore(TypedDict):
    """Multi-utterance segment evaluation result"""
    utterance_indices: List[int]  # Which utterances this segment covers
    metrics: dict[str, MetricScore]  # Aggregate metrics for this segment


class EvaluationResult(TypedDict):
    """
    Unified evaluation result format.

    Based on granularity, only one of overall/per_utterance/per_segment
    will be populated:
    - granularity="utterance": per_utterance has data
    - granularity="segment": per_segment has data
    - granularity="conversation": overall has data
    """
    granularity: Literal["utterance", "segment", "conversation"]
    overall: Optional[dict[str, MetricScore]]  # Conversation-level scores
    per_utterance: Optional[List[UtteranceScore]]  # Per-utterance scores
    per_segment: Optional[List[SegmentScore]]  # Per-segment scores
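

# --- Example usage ---------------------------------------------------------
# Illustrative sketch only: the speaker roles and sample values below are
# hypothetical placeholders, not part of the schema. The metric keys
# ("talk_type", "empathy_er") come from the comment on UtteranceScore above.
if __name__ == "__main__":
    conversation: List[Utterance] = [
        {"speaker": "therapist", "text": "How have you been feeling this week?"},
        {"speaker": "client", "text": "Honestly, a bit better than last time."},
    ]

    # One categorical and one numerical score, matching the MetricScore union
    talk_type: CategoricalScore = {
        "type": "categorical",
        "label": "Change",
        "confidence": 0.92,
    }
    empathy: NumericalScore = {
        "type": "numerical",
        "value": 4.0,
        "max_value": 5.0,
        "label": "High",
    }

    # Utterance-level granularity: only per_utterance is populated,
    # the other two fields stay None per the EvaluationResult docstring.
    result: EvaluationResult = {
        "granularity": "utterance",
        "overall": None,
        "per_utterance": [
            {"index": 1, "metrics": {"talk_type": talk_type, "empathy_er": empathy}},
        ],
        "per_segment": None,
    }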