| | """ |
| | Human Evaluation Schema for Multimodal Coherence Assessment |
| | |
| | This module defines the data structures for collecting and storing |
| | human judgments of multimodal coherence. Designed for single-rater |
| | evaluation with bias mitigation strategies. |
| | """ |
| |
|
from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import List, Optional, Dict, Any
import json
from pathlib import Path
| |
|
| |
|
class LikertScale(int, Enum):
    """Five-point Likert scale used for all coherence ratings.

    Mixes in ``int`` so members compare equal to (and serialize as)
    their raw 1-5 scores.
    """

    COMPLETELY_UNRELATED = 1  # no semantic connection at all
    VAGUE_CONNECTION = 2      # only a loose thematic link
    PARTIAL_MATCH = 3         # some elements align, others do not
    MOSTLY_ALIGNED = 4        # minor discrepancies only
    STRONG_ALIGNMENT = 5      # modalities fully agree
| |
|
| |
|
@dataclass
class CoherenceRubric:
    """
    Structured rubric for consistent human evaluation.

    Each rating criterion has explicit descriptions for each Likert level
    to reduce subjectivity and improve intra-rater reliability.

    Each field maps a Likert score (1-5, matching ``LikertScale``) to the
    anchor text shown to the evaluator for that pairwise judgment.
    ``default_factory`` gives every instance its own dict, so edits to one
    rubric instance cannot leak into others.
    """

    # Anchors for rating how well the image depicts the text.
    text_image_rubric: Dict[int, str] = field(default_factory=lambda: {
        1: "Completely unrelated: Image has no semantic connection to text",
        2: "Vague thematic connection only: General theme matches but specifics differ",
        3: "Partial match: Some elements align, others clearly don't",
        4: "Mostly aligned: Most elements match, minor discrepancies",
        5: "Strong semantic alignment: Image accurately represents text content",
    })

    # Anchors for rating how well the audio fits the described scene.
    text_audio_rubric: Dict[int, str] = field(default_factory=lambda: {
        1: "Completely unrelated: Audio has no connection to described scene",
        2: "Vague connection: General mood might match but sounds don't fit",
        3: "Partial match: Some sounds fit the scene, others are mismatched",
        4: "Mostly aligned: Audio largely fits the scene with minor issues",
        5: "Strong alignment: Audio perfectly complements the described scene",
    })

    # Anchors for rating how well the audio matches the visual content.
    image_audio_rubric: Dict[int, str] = field(default_factory=lambda: {
        1: "Completely unrelated: Audio doesn't match what's shown in image",
        2: "Vague connection: Mood might match but sounds don't fit visuals",
        3: "Partial match: Some sounds plausible for image, others not",
        4: "Mostly aligned: Audio largely fits the visual scene",
        5: "Strong alignment: Audio sounds exactly right for the visual",
    })

    # Anchors for the holistic, all-modality coherence judgment.
    overall_rubric: Dict[int, str] = field(default_factory=lambda: {
        1: "No coherence: Modalities feel randomly combined",
        2: "Weak coherence: Some connection but feels disjointed",
        3: "Moderate coherence: Works together with noticeable gaps",
        4: "Good coherence: Modalities complement each other well",
        5: "Excellent coherence: Unified, immersive multimodal experience",
    })
| |
|
| |
|
@dataclass
class HumanEvaluation:
    """
    A single human evaluation of a multimodal sample.

    Attributes:
        sample_id: Unique identifier for the evaluated sample
        evaluator_id: Identifier for the human evaluator
        text_image_coherence: Rating of text-image alignment (1-5)
        text_audio_coherence: Rating of text-audio alignment (1-5)
        image_audio_coherence: Rating of image-audio alignment (1-5)
        overall_coherence: Holistic coherence rating (1-5)
        confidence: Self-reported confidence in ratings (1-5)
        notes: Optional free-text observations
        timestamp: When the evaluation was completed (ISO-8601 string)
        session_id: Evaluation session identifier (for tracking re-ratings)
        is_rerating: Whether this is a second pass for reliability check
    """
    sample_id: str
    evaluator_id: str
    text_image_coherence: int
    text_audio_coherence: int
    image_audio_coherence: int
    overall_coherence: int
    confidence: int = 3
    notes: str = ""
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
    session_id: str = ""
    is_rerating: bool = False

    def __post_init__(self):
        """Validate that all ratings are within Likert scale bounds (1-5).

        Raises:
            ValueError: If any rating or the confidence is outside [1, 5].
        """
        for attr in ['text_image_coherence', 'text_audio_coherence',
                     'image_audio_coherence', 'overall_coherence', 'confidence']:
            value = getattr(self, attr)
            if not 1 <= value <= 5:
                raise ValueError(f"{attr} must be between 1 and 5, got {value}")

    def mean_pairwise_score(self) -> float:
        """Average of the three pairwise coherence ratings (1-5 scale)."""
        return (self.text_image_coherence + self.text_audio_coherence +
                self.image_audio_coherence) / 3.0

    def weighted_score(self, w_ti: float = 0.45, w_ta: float = 0.45,
                       w_ia: float = 0.10) -> float:
        """
        Weighted average of the pairwise ratings, normalized to [0, 1].

        Default weights: text-image=0.45, text-audio=0.45, image-audio=0.10
        (matching MSCI weights for direct comparison). Weights are
        renormalized by their sum, and the result is divided by 5 to map
        the 1-5 Likert range onto [0.2, 1.0].

        Raises:
            ValueError: If the weights do not sum to a positive value
                (previously this surfaced as an opaque ZeroDivisionError).
        """
        total = w_ti + w_ta + w_ia
        if total <= 0:
            raise ValueError(f"weights must sum to a positive value, got {total}")
        return (w_ti * self.text_image_coherence +
                w_ta * self.text_audio_coherence +
                w_ia * self.image_audio_coherence) / (total * 5)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        return {
            "sample_id": self.sample_id,
            "evaluator_id": self.evaluator_id,
            "text_image_coherence": self.text_image_coherence,
            "text_audio_coherence": self.text_audio_coherence,
            "image_audio_coherence": self.image_audio_coherence,
            "overall_coherence": self.overall_coherence,
            "confidence": self.confidence,
            "notes": self.notes,
            "timestamp": self.timestamp,
            "session_id": self.session_id,
            "is_rerating": self.is_rerating,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HumanEvaluation":
        """Create an instance from a dict produced by to_dict()."""
        return cls(**data)
| |
|
| |
|
@dataclass
class EvaluationSample:
    """
    A sample prepared for human evaluation.

    Carries everything needed to show a sample to the rater. The
    condition/mode/perturbation metadata defaults to empty strings so it
    can be withheld from the evaluator during blind evaluation.
    """
    sample_id: str
    text_content: str
    image_path: str
    audio_path: str

    # Hidden metadata — not shown to the evaluator during blind rating.
    condition: str = ""
    mode: str = ""
    perturbation: str = ""
    msci_score: Optional[float] = None
    run_id: str = ""
    original_prompt: str = ""

    # Serialization order for to_dict(); mirrors the field declarations.
    _FIELD_NAMES = (
        "sample_id", "text_content", "image_path", "audio_path",
        "condition", "mode", "perturbation", "msci_score",
        "run_id", "original_prompt",
    )

    def to_dict(self) -> Dict[str, Any]:
        """Serialize every field into a plain dict for JSON output."""
        return {name: getattr(self, name) for name in self._FIELD_NAMES}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EvaluationSample":
        """Rebuild an instance from a dict produced by to_dict()."""
        return cls(**data)
| |
|
| |
|
@dataclass
class EvaluationSession:
    """
    Tracks a complete human evaluation session.

    Manages the list of samples to evaluate, collects evaluations,
    and supports saving/loading for interrupted sessions.

    Attributes:
        session_id: Unique identifier for this session.
        evaluator_id: Identifier of the human rater.
        samples: Ordered list of samples to be evaluated.
        evaluations: Completed evaluations, appended in sample order.
        current_index: Index of the next sample to present.
        started_at: ISO timestamp captured when the session is created.
        completed_at: ISO timestamp set once every sample is rated.
        rerating_sample_ids: Sample ids queued for a second pass
            (intra-rater reliability check).
    """
    session_id: str
    evaluator_id: str
    samples: List[EvaluationSample]
    evaluations: List[HumanEvaluation] = field(default_factory=list)
    current_index: int = 0
    started_at: str = field(default_factory=lambda: datetime.now().isoformat())
    completed_at: Optional[str] = None

    rerating_sample_ids: List[str] = field(default_factory=list)

    @property
    def progress(self) -> float:
        """Completion percentage in [0, 100]; 0.0 for an empty session."""
        if not self.samples:
            return 0.0
        return len(self.evaluations) / len(self.samples) * 100

    @property
    def is_complete(self) -> bool:
        """Whether all samples have been evaluated (vacuously True if empty)."""
        return len(self.evaluations) >= len(self.samples)

    def get_current_sample(self) -> Optional[EvaluationSample]:
        """Return the next sample to evaluate, or None when finished."""
        if self.current_index < len(self.samples):
            return self.samples[self.current_index]
        return None

    def add_evaluation(self, evaluation: HumanEvaluation) -> None:
        """Record a completed evaluation and advance to the next sample.

        Stamps the evaluation with this session's id, and sets
        ``completed_at`` once every sample has been rated.
        """
        evaluation.session_id = self.session_id
        self.evaluations.append(evaluation)
        self.current_index += 1

        if self.is_complete:
            self.completed_at = datetime.now().isoformat()

    def save(self, path: Path | str) -> None:
        """Save session state to a JSON file.

        Args:
            path: Destination file. A plain string is accepted and
                converted (previously a str crashed on ``.parent``).
                Parent directories are created as needed.
        """
        path = Path(path)  # generalization: accept str as well as Path
        data = {
            "session_id": self.session_id,
            "evaluator_id": self.evaluator_id,
            "samples": [s.to_dict() for s in self.samples],
            "evaluations": [e.to_dict() for e in self.evaluations],
            "current_index": self.current_index,
            "started_at": self.started_at,
            "completed_at": self.completed_at,
            "rerating_sample_ids": self.rerating_sample_ids,
        }
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    @classmethod
    def load(cls, path: Path | str) -> "EvaluationSession":
        """Load session state from a JSON file written by save().

        Args:
            path: Source file; a plain string is accepted and converted.

        Returns:
            A reconstructed EvaluationSession with its samples and
            evaluations rehydrated into their dataclass types.
        """
        with Path(path).open("r", encoding="utf-8") as f:
            data = json.load(f)

        return cls(
            session_id=data["session_id"],
            evaluator_id=data["evaluator_id"],
            samples=[EvaluationSample.from_dict(s) for s in data["samples"]],
            evaluations=[HumanEvaluation.from_dict(e) for e in data["evaluations"]],
            current_index=data["current_index"],
            started_at=data["started_at"],
            # .get() keeps compatibility with files saved before these
            # fields existed.
            completed_at=data.get("completed_at"),
            rerating_sample_ids=data.get("rerating_sample_ids", []),
        )
| |
|
| |
|
@dataclass
class ReliabilityMetrics:
    """
    Intra-rater reliability metrics for single-evaluator studies.

    Summarizes agreement between a rater's first and second pass over
    the same samples, quantifying self-consistency.
    """
    kappa: float
    percent_agreement: float
    weighted_kappa: float
    mean_absolute_difference: float
    n_reratings: int

    @property
    def is_acceptable(self) -> bool:
        """True when Cohen's kappa meets the 0.70 self-consistency bar."""
        return self.kappa >= 0.70

    def to_dict(self) -> Dict[str, Any]:
        """Serialize all metrics (plus the acceptability flag) to a dict."""
        keys = ("kappa", "percent_agreement", "weighted_kappa",
                "mean_absolute_difference", "n_reratings", "is_acceptable")
        return {key: getattr(self, key) for key in keys}
| |
|