"""
Helper functions for creating evaluation results in the standardized format.
"""
from typing import List, Optional
from custom_types import (
Utterance, EvaluationResult, UtteranceScore, SegmentScore,
CategoricalScore, NumericalScore, MetricScore
)


def create_categorical_score(
    label: str,
    confidence: Optional[float] = None
) -> CategoricalScore:
    """
    Create a categorical score.

    Args:
        label: Category label (e.g., "High", "Change", "Positive")
        confidence: Optional confidence score 0-1

    Returns:
CategoricalScore
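
    Example:
        >>> create_categorical_score("High", confidence=0.9)
        {'type': 'categorical', 'label': 'High', 'confidence': 0.9}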
"""
return {
"type": "categorical",
"label": label,
"confidence": confidence
    }


def create_numerical_score(
    value: float,
    max_value: float,
    label: Optional[str] = None
) -> NumericalScore:
    """
    Create a numerical score.

    Args:
        value: The score value
        max_value: Maximum possible score
        label: Optional derived label (e.g., "High" if value > threshold)

    Returns:
NumericalScore
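
    Example:
        >>> create_numerical_score(4.0, 5.0, label="High")
        {'type': 'numerical', 'value': 4.0, 'max_value': 5.0, 'label': 'High'}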
"""
return {
"type": "numerical",
"value": value,
"max_value": max_value,
"label": label
    }


def create_utterance_result(
    conversation: List[Utterance],
    scores_per_utterance: List[dict[str, MetricScore]]
) -> EvaluationResult:
    """
    Create an utterance-level evaluation result.

    Args:
        conversation: The full conversation
        scores_per_utterance: List of metric scores, one dict per utterance

    Returns:
EvaluationResult with granularity="utterance"
"""
    if len(scores_per_utterance) != len(conversation):
        raise ValueError(
            f"Expected {len(conversation)} score dicts (one per utterance), "
            f"got {len(scores_per_utterance)}"
        )
    per_utterance: List[UtteranceScore] = []
    for i, scores in enumerate(scores_per_utterance):
per_utterance.append({
"index": i,
"metrics": scores
})
return {
"granularity": "utterance",
"overall": None,
"per_utterance": per_utterance,
"per_segment": None
    }


def create_conversation_result(
    overall_scores: dict[str, MetricScore]
) -> EvaluationResult:
    """
    Create a conversation-level evaluation result.

    Args:
        overall_scores: Aggregate scores for the entire conversation

    Returns:
EvaluationResult with granularity="conversation"
"""
return {
"granularity": "conversation",
"overall": overall_scores,
"per_utterance": None,
"per_segment": None
    }


def create_segment_result(
    segments: List[tuple[List[int], dict[str, MetricScore]]]
) -> EvaluationResult:
    """
    Create a segment-level evaluation result.

    Args:
        segments: List of (utterance_indices, scores) tuples

    Returns:
EvaluationResult with granularity="segment"
"""
per_segment: List[SegmentScore] = []
for utterance_indices, scores in segments:
per_segment.append({
"utterance_indices": utterance_indices,
"metrics": scores
})
return {
"granularity": "segment",
"overall": None,
"per_utterance": None,
"per_segment": per_segment
    }


def derive_label_from_score(
    value: float,
    max_value: float,
    thresholds: Optional[dict[str, float]] = None
) -> str:
    """
    Derive a categorical label from a numerical score.

    Args:
        value: The score value
        max_value: Maximum possible score (must be non-zero)
        thresholds: Optional mapping from label to the inclusive upper bound
            on value / max_value. Default: {"Low": 0.33, "Medium": 0.66, "High": 1.0}

    Returns:
Label string
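
    Example:
        >>> derive_label_from_score(2.0, 10.0)
        'Low'
        >>> derive_label_from_score(5.0, 10.0)
        'Medium'
        >>> derive_label_from_score(9.0, 10.0)
        'High'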
"""
    if thresholds is None:
        thresholds = {"Low": 0.33, "Medium": 0.66, "High": 1.0}
    ratio = value / max_value
    # Check buckets from the lowest threshold upward; the first threshold the
    # ratio does not exceed wins.
    ordered = sorted(thresholds.items(), key=lambda x: x[1])
    for label, threshold in ordered:
        if ratio <= threshold:
            return label
    # The ratio exceeded every threshold (e.g., value > max_value); fall back
    # to the label with the highest threshold, not the last-inserted key.
    return ordered[-1][0]
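

if __name__ == "__main__":
    # Lightweight sanity check: run the usage examples embedded in the
    # docstrings above as doctests.
    import doctest
    doctest.testmod()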