Spaces:

anfastech
/

zlaqa-version-c-ai-enginee

Sleeping

File size: 4,833 Bytes

1cd6149

"""
Annotation Helper Utilities

Helper functions for phoneme-level annotation tasks.
"""

import json
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional
import numpy as np

logger = logging.getLogger(__name__)


def load_annotations(annotations_file: Path = Path("data/annotations.json")) -> List[Dict[str, Any]]:
    """Load annotations from a JSON file.

    This is a best-effort loader: a missing, unreadable, malformed, or
    wrongly-shaped file is logged and reported as "no annotations" rather
    than raised to the caller.

    Args:
        annotations_file: Location of the annotations JSON file. A plain
            string path is accepted as well as a ``Path``.

    Returns:
        The list stored in the file, or an empty list when the file does not
        exist, cannot be read or parsed, or does not hold a JSON array.
    """
    # Coerce so that callers passing a plain string path do not crash on
    # the ``.exists()`` call below.
    annotations_file = Path(annotations_file)

    if not annotations_file.exists():
        logger.warning(f"Annotations file not found: {annotations_file}")
        return []

    try:
        with open(annotations_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # OSError covers I/O failures; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError. Anything else is a
        # programming error and should surface instead of being hidden.
        logger.error(f"Failed to load annotations: {e}")
        return []

    # Callers iterate the result and call ``.get`` on each item, so a
    # non-list top-level document (e.g. an object) is treated as a failure.
    if not isinstance(data, list):
        logger.error(
            f"Failed to load annotations: expected a JSON array in "
            f"{annotations_file}, got {type(data).__name__}"
        )
        return []

    return data


def save_annotations(annotations: List[Dict[str, Any]], annotations_file: Path = Path("data/annotations.json")):
    """Write the annotations to a JSON file.

    The destination directory is created when it does not exist yet. The
    file is written as UTF-8, indented by two spaces, with non-ASCII
    characters kept as-is rather than escaped.

    Args:
        annotations: Annotation records to serialize.
        annotations_file: Destination path of the JSON file.
    """
    target_dir = annotations_file.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    with annotations_file.open('w', encoding='utf-8') as handle:
        json.dump(annotations, handle, indent=2, ensure_ascii=False)

    logger.info(f"Saved {len(annotations)} annotations to {annotations_file}")


def get_annotation_statistics(annotations: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Summarize a collection of annotation records.

    Args:
        annotations: Annotation records. Each record may carry a
            ``total_errors`` number and a ``phoneme_errors`` list whose
            items may have ``error_type`` and ``phoneme`` keys.

    Returns:
        A dict with:
            ``total_samples``: number of records,
            ``total_errors``: sum of the records' ``total_errors`` values,
            ``error_types``: count of phoneme errors per error type,
            ``phoneme_errors``: count of phoneme errors per phoneme,
            ``avg_errors_per_sample``: ``total_errors / total_samples``,
            or ``0.0`` when there are no records.
    """
    sample_count = len(annotations)

    error_total = 0
    for record in annotations:
        error_total += record.get('total_errors', 0)

    # The known error types are always reported, even with a zero count;
    # any other type found in the data is added on first sight.
    type_counts = dict.fromkeys(
        ('substitution', 'omission', 'distortion', 'stutter', 'normal'), 0
    )
    per_phoneme = {}

    every_error = (
        entry
        for record in annotations
        for entry in record.get('phoneme_errors', [])
    )
    for entry in every_error:
        kind = entry.get('error_type', 'normal')
        type_counts[kind] = type_counts.get(kind, 0) + 1

        symbol = entry.get('phoneme', 'unknown')
        per_phoneme[symbol] = per_phoneme.get(symbol, 0) + 1

    if sample_count > 0:
        mean_errors = error_total / sample_count
    else:
        mean_errors = 0.0

    return {
        'total_samples': sample_count,
        'total_errors': error_total,
        'error_types': type_counts,
        'phoneme_errors': per_phoneme,
        'avg_errors_per_sample': mean_errors,
    }


# Base class ids of the 8-class frame labelling scheme. Classes 5-7 are the
# articulation classes 1-3 shifted by the stutter class id (4) and mark a
# frame that carries both a stutter and an articulation error.
_ERROR_CLASS_IDS = {
    'normal': 0,
    'substitution': 1,
    'omission': 2,
    'distortion': 3,
    'stutter': 4,
}


def _build_frame_labels(phoneme_errors: List[Dict[str, Any]], num_frames: int) -> List[int]:
    """Turn one sample's phoneme errors into a per-frame class-id list."""
    stutter_class = _ERROR_CLASS_IDS['stutter']

    # Pass 1: index, per frame, whether it stutters and which articulation
    # error (class 1-3) was listed last for it. Doing this once replaces a
    # rescan of the whole error list for every single error.
    stutter_frames = set()
    last_articulation = {}
    for err in phoneme_errors:
        frame_id = err.get('frame_id', 0)
        class_id = _ERROR_CLASS_IDS.get(err.get('error_type', 'normal'), 0)
        if class_id == stutter_class:
            stutter_frames.add(frame_id)
        elif class_id != 0:
            last_articulation[frame_id] = class_id

    # Pass 2: write the labels in list order (a later entry for the same
    # frame overwrites an earlier one).
    labels = [0] * num_frames
    for err in phoneme_errors:
        frame_id = err.get('frame_id', 0)
        err_type = err.get('error_type', 'normal')
        class_id = _ERROR_CLASS_IDS.get(err_type, 0)

        if err_type == 'stutter':
            # Keep the combined class even when the stutter entry comes
            # after the articulation error in the list; otherwise it would
            # overwrite 5-7 with a plain 4.
            class_id += last_articulation.get(frame_id, 0)
        elif err_type != 'normal' and frame_id in stutter_frames:
            # Articulation error on a stuttered frame -> classes 5-7. An
            # unrecognised error type (base 0) ends up as plain stutter.
            class_id += stutter_class

        # Errors pointing outside the sample's frame range are ignored.
        if 0 <= frame_id < num_frames:
            labels[frame_id] = class_id

    return labels


def export_for_training(
    annotations: List[Dict[str, Any]],
    output_file: Path = Path("data/training_dataset.json")
) -> Dict[str, Any]:
    """Export annotations in training-ready format.

    Every annotation becomes one sample with a frame-level label list, one
    label per 20 ms frame of the sample's ``duration`` (in seconds). Labels
    use an 8-class scheme: 0 normal, 1 substitution, 2 omission,
    3 distortion, 4 stutter, and 5-7 for stutter combined with
    substitution / omission / distortion on the same frame. The combined
    class is assigned regardless of the order in which the two errors
    appear in ``phoneme_errors``. An error without ``frame_id`` is placed on
    frame 0; an error whose ``frame_id`` is outside the sample is skipped.

    Args:
        annotations: Annotation records with optional ``audio_file``,
            ``expected_text``, ``duration`` and ``phoneme_errors`` keys.
        output_file: Destination JSON file; its parent directory is created
            when missing.

    Returns:
        A dict with ``samples`` (number of exported samples) and
        ``output_file`` (the destination path as a string).
    """
    frame_ms = 20  # length of one label frame in milliseconds
    training_data = []

    for ann in annotations:
        # Treat an explicit JSON null the same as a missing key.
        duration = ann.get('duration')
        if duration is None:
            duration = 0.0
        phoneme_errors = ann.get('phoneme_errors') or []

        # A negative duration must not produce a negative frame count.
        num_frames = max(0, int((duration * 1000) / frame_ms))

        training_data.append({
            'audio_file': ann.get('audio_file'),
            'expected_text': ann.get('expected_text', ''),
            'duration': duration,
            'num_frames': num_frames,
            'frame_labels': _build_frame_labels(phoneme_errors, num_frames),
            'phoneme_errors': phoneme_errors
        })

    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(training_data, f, indent=2, ensure_ascii=False)

    logger.info(f"Exported {len(training_data)} samples for training to {output_file}")

    return {
        'samples': len(training_data),
        'output_file': str(output_file)
    }


if __name__ == "__main__":
    # Demo run: read the default annotations file, print a short summary,
    # and export a training dataset when at least one annotation exists.
    loaded = load_annotations()
    summary = get_annotation_statistics(loaded)

    report_lines = (
        f"Total samples: {summary['total_samples']}",
        f"Total errors: {summary['total_errors']}",
        f"Error types: {summary['error_types']}",
    )
    for line in report_lines:
        print(line)

    if loaded:
        export_for_training(loaded)