# Spaces: aether-raid / atc-tts-mos (Sleeping)
# File size: 21,801 Bytes

# backend/session_manager.py

import json
import random
import time
import uuid
from typing import List, Dict, Any, Optional

from .models import Clip, get_display_model_name


class SessionManager:
    """Manages evaluation sessions, responses, and export logic."""

    def __init__(self, data_manager):
        self.data_manager = data_manager
        self.sessions: Dict[str, Dict[str, Any]] = {}
        self.responses: Dict[str, List[Dict[str, Any]]] = {
            "mos": [],
            "ab": [],
            "feedback": [],
        }

    # --------------------------
    # Session creation
    # --------------------------
    def create_session(self) -> Dict[str, Any]:
        session_id = str(uuid.uuid4())
        clips = self.data_manager.load_clips()

        rng = random.Random(time.time())

        mos_clips: List[Clip] = []
        models = {clip.model for clip in clips}

        # Build MOS clip set
        for model in models:
            model_clips = [clip for clip in clips if clip.model == model]

            exercise_groups: Dict[str, Dict[str, List[Clip]]] = {}
            for clip in model_clips:
                if clip.exercise_id not in exercise_groups:
                    exercise_groups[clip.exercise_id] = {"male": [], "female": []}
                exercise_groups[clip.exercise_id][clip.speaker].append(clip)

            # Collect all clips from this model
            all_model_clips = []
            for _, speakers in exercise_groups.items():
                if speakers["male"]:
                    all_model_clips.extend(speakers["male"])
                if speakers["female"]:
                    all_model_clips.extend(speakers["female"])

            # Select 3 random clips (regardless of gender pairing) for this model
            selected_clips = rng.sample(all_model_clips, min(3, len(all_model_clips)))
            mos_clips.extend(selected_clips)

        # Group by content (exercise + transcript) for comparisons
        content_groups: Dict[Any, List[Clip]] = {}
        for clip in clips:
            key = (clip.exercise, clip.exercise_id, clip.transcript)
            content_groups.setdefault(key, []).append(clip)


        # --- Model vs Model (same gender, same exercise) ---
        ab_model_pairs = []
        # Get all unique exercises
        all_exercises = list({key[1] for key in content_groups.keys()})
        rng.shuffle(all_exercises)
        max_pairs = 6
        for exercise_id in all_exercises:
            # Find all content groups for this exercise
            matching_keys = [k for k in content_groups if k[1] == exercise_id]
            if not matching_keys:
                continue
            # Pick a random content group for this exercise
            key = rng.choice(matching_keys)
            group = content_groups[key]
            # Group by model and speaker
            model_speaker_map: Dict[str, Dict[str, List[Clip]]] = {}
            for clip in group:
                model_speaker_map.setdefault(clip.model, {}).setdefault(clip.speaker, []).append(clip)
            model_names = list(model_speaker_map.keys())
            if len(model_names) < 2:
                continue
            # Try to find a random valid gender for this exercise
            valid_genders = [s for s in ["male", "female"] if sum(1 for m in model_names if s in model_speaker_map[m] and model_speaker_map[m][s]) >= 2]
            if not valid_genders:
                continue
            speaker = rng.choice(valid_genders)
            available_models = [model for model in model_names if speaker in model_speaker_map[model] and model_speaker_map[model][speaker]]
            if len(available_models) < 2:
                continue
            model_a, model_b = rng.sample(available_models, 2)
            clip_a = rng.choice(model_speaker_map[model_a][speaker])
            clip_b = rng.choice(model_speaker_map[model_b][speaker])
            ab_model_pairs.append((clip_a, clip_b))
            if len(ab_model_pairs) >= max_pairs:
                break

        # --- Gender vs Gender (same model, same exercise) ---
        ab_gender_pairs = []
        rng.shuffle(all_exercises)
        for exercise_id in all_exercises:
            matching_keys = [k for k in content_groups if k[1] == exercise_id]
            if not matching_keys:
                continue
            key = rng.choice(matching_keys)
            group = content_groups[key]
            # Group by model and gender
            model_gender_groups: Dict[str, Dict[str, List[Clip]]] = {}
            for clip in group:
                model_gender_groups.setdefault(clip.model, {}).setdefault(clip.speaker, []).append(clip)
            valid_models = [m for m, genders in model_gender_groups.items() if "male" in genders and "female" in genders and genders["male"] and genders["female"]]
            if not valid_models:
                continue
            model = rng.choice(valid_models)
            gender_groups = model_gender_groups[model]
            clip_male = rng.choice(gender_groups["male"])
            clip_female = rng.choice(gender_groups["female"])
            ab_gender_pairs.append((clip_male, clip_female))
            if len(ab_gender_pairs) >= max_pairs:
                break

        session_data: Dict[str, Any] = {
            "session_id": session_id,
            "created_at": time.time(),
            "mos_clips": mos_clips,
            "ab_model_pairs": ab_model_pairs,
            "ab_gender_pairs": ab_gender_pairs,
            "completed": False,
        }

        self.sessions[session_id] = session_data
        return session_data

    # --------------------------
    # Response storage helpers
    # --------------------------
    def save_response(self, response_type: str, response: Dict[str, Any]):
        """Generic low-level append with auto-timestamp."""
        if "timestamp" not in response:
            response["timestamp"] = time.time()
        self.responses.setdefault(response_type, []).append(response)

    def save_mos_rating(
        self,
        session: Dict[str, Any],
        clip_id: str,
        model: str,
        clarity: Optional[int],
        pronunciation: Optional[int],
        prosody: Optional[int],
        naturalness: Optional[int],
        overall: Optional[int],
        comment: str,
        gender_mismatch: bool,
    ) -> None:
        """Optional helper for saving a single MOS rating."""
        if not session:
            return

        mos_response = {
            "session_id": session["session_id"],
            "clip_id": clip_id,
            "clarity": int(clarity) if clarity is not None else None,
            "pronunciation": int(pronunciation) if pronunciation is not None else None,
            "prosody": int(prosody) if prosody is not None else None,
            "naturalness": int(naturalness) if naturalness is not None else None,
            "overall": int(overall) if overall is not None else None,
            "comment": comment or "",
            "gender_mismatch": bool(gender_mismatch),
            "timestamp": time.time(),
        }
        self.save_response("mos", mos_response)

    def save_ab_rating(
        self,
        session: Dict[str, Any],
        clip_a_id: str,
        clip_b_id: str,
        comparison_type: str,
        choice: str,
        comment: str,
        gender_mismatch_a: bool,
        gender_mismatch_b: bool,
    ) -> None:
        """Optional helper for saving a single A/B comparison."""
        if not session:
            return

        ab_response = {
            "session_id": session["session_id"],
            "clip_a_id": clip_a_id,
            "clip_b_id": clip_b_id,
            "comparison_type": comparison_type,
            "choice": choice,
            "comment": comment or "",
            "gender_mismatch_a": bool(gender_mismatch_a),
            "gender_mismatch_b": bool(gender_mismatch_b),
            "timestamp": time.time(),
        }
        self.save_response("ab", ab_response)

    # --------------------------
    # Bulk processing from JS JSON
    # --------------------------
    def process_mos_data(
        self,
        session: Dict[str, Any],
        mos_data_json: str,
    ) -> None:
        """
        Take the JSON string from the hidden MOS textbox and turn it into
        individual MOS responses in self.responses["mos"].
        """
        print(f"[DEBUG] process_mos_data called with JSON: '{mos_data_json}'")
        print(f"[DEBUG] Session ID: {session.get('session_id') if session else 'None'}")
        if not session or not mos_data_json:
            print(f"[DEBUG] Skipping MOS processing - session: {session is not None}, data length: {len(mos_data_json) if mos_data_json else 0}")
            return

        try:
            ratings_data = json.loads(mos_data_json) if mos_data_json else {}
        except json.JSONDecodeError as e:
            print(f"[WARN] Failed to parse MOS data JSON: {e}")
            return

        try:
            # Get all clips that were presented to the user
            presented_clips = session.get("mos_clips", [])
            presented_clip_ids = {clip.id for clip in presented_clips}
            
            print(f"[DEBUG] Presented {len(presented_clip_ids)} MOS clips to user")
            print(f"[DEBUG] Received ratings for {len(ratings_data)} clips")
            
            # Process all presented clips, whether rated or not
            for clip in presented_clips:
                clip_id = clip.id
                ratings = ratings_data.get(clip_id, {})
                
                mos_response = {
                    "session_id": session["session_id"],
                    "clip_id": clip_id,
                    "clarity": int(ratings.get("clarity"))
                    if ratings.get("clarity")
                    else None,
                    "pronunciation": int(ratings.get("pronunciation"))
                    if ratings.get("pronunciation")
                    else None,
                    "prosody": int(ratings.get("prosody"))
                    if ratings.get("prosody")
                    else None,
                    "naturalness": int(ratings.get("naturalness"))
                    if ratings.get("naturalness")
                    else None,
                    "overall": int(ratings.get("overall"))
                    if ratings.get("overall")
                    else None,
                    "comment": ratings.get("comment", ""),
                    "gender_mismatch": ratings.get("gender_mismatch", False),
                    "timestamp": time.time(),
                }

                self.save_response("mos", mos_response)
                
                # Log whether this clip was rated or not
                has_ratings = any(
                    ratings.get(dim)
                    for dim in ["clarity", "pronunciation", "prosody", "naturalness", "overall"]
                )
                status = "rated" if has_ratings else "not rated"
                print(f"[INFO] Processed MOS clip {clip_id} ({status})")

        except Exception as e:
            print(f"[WARN] Error processing MOS data: {e}")

    def process_ab_data(
        self,
        session: Dict[str, Any],
        ab_data_json: str,
    ) -> None:
        """
        Take the JSON string from the hidden AB textbox and turn it into
        individual A/B responses in self.responses["ab"].
        """
        print(f"[DEBUG] process_ab_data called with JSON: '{ab_data_json}'")
        print(f"[DEBUG] Session ID: {session.get('session_id') if session else 'None'}")
        if not session or not ab_data_json:
            print(f"[DEBUG] Skipping AB processing - session: {session is not None}, data length: {len(ab_data_json) if ab_data_json else 0}")
            return

        try:
            comparisons_data = json.loads(ab_data_json) if ab_data_json else {}
        except json.JSONDecodeError as e:
            print(f"[WARN] Failed to parse A/B data JSON: {e}")
            return

        try:
            print(f"[DEBUG] Received ratings for {len(comparisons_data)} comparisons")
            
            # Build a set of already saved comparison IDs to avoid duplicates
            session_id = session["session_id"]
            existing_ab_responses = [
                r for r in self.responses.get("ab", [])
                if r.get("session_id") == session_id
            ]
            existing_pairs = {
                (r["clip_a_id"], r["clip_b_id"])
                for r in existing_ab_responses
            }
            print(f"[DEBUG] Already have {len(existing_pairs)} AB comparisons saved for this session")
            
            # Collect ALL comparison types present in the data
            comparison_types = set()
            for comp_data in comparisons_data.values():
                if comp_data.get("comparison_type"):
                    comparison_types.add(comp_data["comparison_type"])
            
            print(f"[DEBUG] Found comparison types in data: {comparison_types}")
            
            # Process each comparison type separately
            all_presented_pairs = []
            for comparison_type in comparison_types:
                if comparison_type == "model_vs_model":
                    pairs = session.get("ab_model_pairs", [])
                    print(f"[DEBUG] Processing model-vs-model pairs: {len(pairs)} pairs presented")
                    all_presented_pairs.extend([(clip_a, clip_b, comparison_type) for clip_a, clip_b in pairs])
                elif comparison_type == "gender_vs_gender":
                    pairs = session.get("ab_gender_pairs", [])
                    print(f"[DEBUG] Processing gender-vs-gender pairs: {len(pairs)} pairs presented")
                    all_presented_pairs.extend([(clip_a, clip_b, comparison_type) for clip_a, clip_b in pairs])
                else:
                    print(f"[WARN] Unknown comparison type: {comparison_type}")
            
            # Process all presented pairs
            for clip_a, clip_b, comparison_type in all_presented_pairs:
                clip_a_id = clip_a.id
                clip_b_id = clip_b.id
                
                # Skip if we've already saved this pair
                if (clip_a_id, clip_b_id) in existing_pairs:
                    print(f"[DEBUG] Skipping duplicate comparison: {clip_a_id} vs {clip_b_id}")
                    continue
                
                # Find user's rating for this pair from the submitted data
                # JS sends numeric keys ("1", "2", etc.), so search by clip IDs
                comparison = {}
                for comp_data in comparisons_data.values():
                    if comp_data.get("clip_a_id") == clip_a_id and comp_data.get("clip_b_id") == clip_b_id:
                        comparison = comp_data
                        break
                
                ab_response = {
                    "session_id": session_id,
                    "clip_a_id": clip_a_id,
                    "clip_b_id": clip_b_id,
                    "comparison_type": comparison_type,
                    "choice": comparison.get("choice"),  # Can be None if not rated
                    "comment": comparison.get("comment", ""),
                    # Support both model_vs_model (gender_mismatch_a/b)
                    # and gender_vs_gender (gender_mismatch_male/female)
                    "gender_mismatch_a": comparison.get("gender_mismatch_a", False)
                    or comparison.get("gender_mismatch_male", False),
                    "gender_mismatch_b": comparison.get("gender_mismatch_b", False)
                    or comparison.get("gender_mismatch_female", False),
                    "timestamp": time.time(),
                }

                self.save_response("ab", ab_response)
                existing_pairs.add((clip_a_id, clip_b_id))  # Mark as saved
                
                status = "rated" if comparison.get("choice") else "not rated"
                print(f"[INFO] Processed A/B comparison {clip_a_id} vs {clip_b_id} ({status})")

        except Exception as e:
            print(f"[WARN] Error processing A/B data: {e}")

    # --------------------------
    # Export
    # --------------------------
    def export_session(self, session_id: str) -> Dict[str, Any]:
        """Build a fully annotated export dict for a given session."""
        session = self.sessions.get(session_id, {})

        # Create detailed MOS responses with full clip metadata
        detailed_mos_responses = []
        session_mos_clips = {clip.id: clip for clip in session.get("mos_clips", [])}

        for r in self.responses.get("mos", []):
            if r.get("session_id") != session_id:
                continue

            clip_id = r.get("clip_id")
            clip = session_mos_clips.get(clip_id)
            if not clip:
                continue

            detailed_response = {
                # Session metadata
                "session_id": session_id,
                "response_timestamp": r.get("timestamp", time.time()),
                # Full clip metadata
                "clip_id": clip_id,
                "exercise": clip.exercise,
                "exercise_id": clip.exercise_id,
                "transcript": clip.transcript,
                "model": clip.model,  # Original model name
                "display_model": get_display_model_name(
                    clip.model
                ),  # Anonymized name
                "speaker": clip.speaker,
                # MOS ratings
                "clarity": r.get("clarity"),
                "pronunciation": r.get("pronunciation"),
                "prosody": r.get("prosody"),
                "naturalness": r.get("naturalness"),
                "overall": r.get("overall"),
                "comment": r.get("comment", ""),
                # Quality control flags
                "gender_mismatch": r.get(
                    "gender_mismatch", False
                ),  # True if user flagged wrong gender
                # Response type
                "evaluation_type": "mos_rating",
            }
            detailed_mos_responses.append(detailed_response)

        # Create detailed A/B responses with full clip metadata
        detailed_ab_responses = []
        session_ab_model_pairs = session.get("ab_model_pairs", [])
        session_ab_gender_pairs = session.get("ab_gender_pairs", [])

        for r in self.responses.get("ab", []):
            if r.get("session_id") != session_id:
                continue

            clip_a_id = r.get("clip_a_id")
            clip_b_id = r.get("clip_b_id")
            comparison_type = r.get("comparison_type")

            # Find the clips from session pairs
            clip_a, clip_b = None, None

            if comparison_type == "model_vs_model":
                for pair_a, pair_b in session_ab_model_pairs:
                    if pair_a.id == clip_a_id and pair_b.id == clip_b_id:
                        clip_a, clip_b = pair_a, pair_b
                        break
            elif comparison_type == "gender_vs_gender":
                for pair_a, pair_b in session_ab_gender_pairs:
                    if pair_a.id == clip_a_id and pair_b.id == clip_b_id:
                        clip_a, clip_b = pair_a, pair_b
                        break

            if not (clip_a and clip_b):
                continue

            detailed_response = {
                # Session metadata
                "session_id": session_id,
                "response_timestamp": r.get("timestamp", time.time()),
                # Comparison metadata
                "comparison_type": comparison_type,
                "choice": r.get("choice"),
                "comment": r.get("comment", ""),
                # Clip A metadata
                "clip_a_id": clip_a.id,
                "clip_a_exercise": clip_a.exercise,
                "clip_a_exercise_id": clip_a.exercise_id,
                "clip_a_transcript": clip_a.transcript,
                "clip_a_model": clip_a.model,
                "clip_a_display_model": get_display_model_name(clip_a.model),
                "clip_a_speaker": clip_a.speaker,
                # Clip B metadata
                "clip_b_id": clip_b.id,
                "clip_b_exercise": clip_b.exercise,
                "clip_b_exercise_id": clip_b.exercise_id,
                "clip_b_transcript": clip_b.transcript,
                "clip_b_model": clip_b.model,
                "clip_b_display_model": get_display_model_name(clip_b.model),
                "clip_b_speaker": clip_b.speaker,
                # Quality control flags
                "gender_mismatch_a": r.get(
                    "gender_mismatch_a", False
                ),  # True if clip A has wrong gender
                "gender_mismatch_b": r.get(
                    "gender_mismatch_b", False
                ),  # True if clip B has wrong gender
                # Response type
                "evaluation_type": "ab_comparison",
            }
            detailed_ab_responses.append(detailed_response)

        return {
            "session_metadata": {
                "session_id": session_id,
                "created_at": session.get("created_at"),
                "completed": session.get("completed", False),
                "exported_at": time.time(),
                "total_mos_ratings": len(detailed_mos_responses),
                "total_ab_comparisons": len(detailed_ab_responses),
            },
            "mos_ratings": detailed_mos_responses,
            "ab_comparisons": detailed_ab_responses,
            "overall_feedback": [
                r
                for r in self.responses.get("feedback", [])
                if r.get("session_id") == session_id
            ],
        }