Spaces:

anfastech
/

zlaqa-version-c-ai-enginee

Sleeping

File size: 55,632 Bytes

e08baf1

# diagnosis/ai_engine/detect_stuttering.py
import os
import librosa
import torch
import logging
import numpy as np
from transformers import Wav2Vec2ForCTC, AutoProcessor
import time
from dataclasses import dataclass, field
from typing import List, Dict, Any, Tuple, Optional
from difflib import SequenceMatcher
import re
# Advanced similarity and distance metrics
from scipy.spatial.distance import cosine, euclidean
from scipy.stats import pearsonr

logger = logging.getLogger(__name__)

# === CONFIGURATION ===
MODEL_ID = "ai4bharat/indicwav2vec-hindi"  # Only model used - IndicWav2Vec Hindi for ASR
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
HF_TOKEN = os.getenv("HF_TOKEN")  # Hugging Face token for authenticated model access

INDIAN_LANGUAGES = {
    'hindi': 'hin', 'english': 'eng', 'tamil': 'tam', 'telugu': 'tel',
    'bengali': 'ben', 'marathi': 'mar', 'gujarati': 'guj', 'kannada': 'kan',
    'malayalam': 'mal', 'punjabi': 'pan', 'urdu': 'urd', 'assamese': 'asm',
    'odia': 'ory', 'bhojpuri': 'bho', 'maithili': 'mai'
}

# === DEVANAGARI PHONETIC MAPPINGS (Research-Based) ===
# Consonants grouped by phonetic similarity for stutter detection
DEVANAGARI_CONSONANT_GROUPS = {
    # Plosives (stops)
    'velar': ['क', 'ख', 'ग', 'घ', 'ङ'],
    'palatal': ['च', 'छ', 'ज', 'झ', 'ञ'],
    'retroflex': ['ट', 'ठ', 'ड', 'ढ', 'ण'],
    'dental': ['त', 'थ', 'द', 'ध', 'न'],
    'labial': ['प', 'फ', 'ब', 'भ', 'म'],
    # Fricatives & Approximants
    'sibilants': ['श', 'ष', 'स', 'ह'],
    'liquids': ['र', 'ल', 'ळ'],
    'semivowels': ['य', 'व'],
}

# Vowels grouped by phonetic features
DEVANAGARI_VOWEL_GROUPS = {
    'short': ['अ', 'इ', 'उ', 'ऋ'],
    'long': ['आ', 'ई', 'ऊ', 'ॠ'],
    'diphthongs': ['ए', 'ऐ', 'ओ', 'औ'],
}

# Common Hindi stutter patterns (research-based)
HINDI_STUTTER_PATTERNS = {
    'repetition': [r'(.)\1{2,}', r'(\w+)\s+\1', r'(\w)\s+\1'],  # Character/word repetition
    'prolongation': [r'(.)\1{3,}', r'[आईऊएओ]{2,}'],  # Extended vowels
    'filled_pause': ['अ', 'उ', 'ए', 'म', 'उम', 'आ'],  # Hesitation sounds
}

# === RESEARCH-BASED THRESHOLDS (2024-2025 Literature) ===
# Prolongation Detection (Spectral Correlation + Duration)
PROLONGATION_CORRELATION_THRESHOLD = 0.90  # >0.9 spectral similarity
PROLONGATION_MIN_DURATION = 0.25  # >250ms (Revisiting Rule-Based, 2025)

# Block Detection (Silence Analysis)
BLOCK_SILENCE_THRESHOLD = 0.35  # >350ms silence mid-utterance
BLOCK_ENERGY_PERCENTILE = 10  # Bottom 10% energy = silence

# Repetition Detection (DTW + Text Matching)
REPETITION_DTW_THRESHOLD = 0.15  # Normalized DTW distance
REPETITION_MIN_SIMILARITY = 0.85  # Text-based similarity

# Speaking Rate Norms (syllables/second)
SPEECH_RATE_MIN = 2.0
SPEECH_RATE_MAX = 6.0
SPEECH_RATE_TYPICAL = 4.0

# Formant Analysis (Vowel Centralization - Research Finding)
# People who stutter show reduced vowel space area
VOWEL_SPACE_REDUCTION_THRESHOLD = 0.70  # 70% of typical area

# Voice Quality (Jitter, Shimmer, HNR)
JITTER_THRESHOLD = 0.01  # >1% jitter indicates instability
SHIMMER_THRESHOLD = 0.03  # >3% shimmer
HNR_THRESHOLD = 15.0  # <15 dB Harmonics-to-Noise Ratio

# Zero-Crossing Rate (Voiced/Unvoiced Discrimination)
ZCR_VOICED_THRESHOLD = 0.1  # Low ZCR = voiced
ZCR_UNVOICED_THRESHOLD = 0.3  # High ZCR = unvoiced

# Entropy-Based Uncertainty
ENTROPY_HIGH_THRESHOLD = 3.5  # High confusion in model predictions
CONFIDENCE_LOW_THRESHOLD = 0.40  # Low confidence frame threshold

@dataclass
class StutterEvent:
    """Enhanced stutter event with multi-modal features"""
    type: str  # 'repetition', 'prolongation', 'block', 'dysfluency', 'mismatch'
    start: float
    end: float
    text: str
    confidence: float
    acoustic_features: Dict[str, float] = field(default_factory=dict)
    voice_quality: Dict[str, float] = field(default_factory=dict)
    formant_data: Dict[str, Any] = field(default_factory=dict)
    phonetic_similarity: float = 0.0  # For comparing expected vs actual sounds


class AdvancedStutterDetector:
    """
    🎤 IndicWav2Vec Hindi ASR Engine
    
    Simplified engine using ONLY ai4bharat/indicwav2vec-hindi for Automatic Speech Recognition.
    
    Features:
    - Speech-to-text transcription using IndicWav2Vec Hindi model
    - Text-based stutter analysis from transcription
    - Confidence scoring from model predictions
    - Basic dysfluency detection from transcript patterns
    
    Model: ai4bharat/indicwav2vec-hindi (Wav2Vec2ForCTC)
    Purpose: Automatic Speech Recognition (ASR) for Hindi and Indian languages
    """

    def __init__(self):
        logger.info(f"🚀 Initializing Advanced AI Engine on {DEVICE}...")
        if HF_TOKEN:
            logger.info("✅ HF_TOKEN found - using authenticated model access")
        else:
            logger.warning("⚠️ HF_TOKEN not found - model access may fail if authentication is required")
        try:
            # Wav2Vec2 Model Loading - IndicWav2Vec Hindi Model
            self.processor = AutoProcessor.from_pretrained(
                MODEL_ID,
                token=HF_TOKEN
            )
            self.model = Wav2Vec2ForCTC.from_pretrained(
                MODEL_ID,
                token=HF_TOKEN,
                torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32
            ).to(DEVICE)
            self.model.eval()
            
            # Initialize feature extractor (clean architecture pattern)
            from .features import ASRFeatureExtractor
            self.feature_extractor = ASRFeatureExtractor(
                model=self.model,
                processor=self.processor,
                device=DEVICE
            )
            
            # Debug: Log processor structure
            logger.info(f"📋 Processor type: {type(self.processor)}")
            if hasattr(self.processor, 'tokenizer'):
                logger.info(f"📋 Tokenizer type: {type(self.processor.tokenizer)}")
            if hasattr(self.processor, 'feature_extractor'):
                logger.info(f"📋 Feature extractor type: {type(self.processor.feature_extractor)}")

            logger.info("✅ IndicWav2Vec Hindi ASR Engine Loaded with Feature Extractor")
        except Exception as e:
            logger.error(f"🔥 Engine Failure: {e}")
            raise

    def _init_common_adapters(self):
        """Not applicable - IndicWav2Vec Hindi doesn't use adapters"""
        pass

    def _activate_adapter(self, lang_code: str):
        """Not applicable - IndicWav2Vec Hindi doesn't use adapters"""
        logger.info(f"Using IndicWav2Vec Hindi model (optimized for Hindi)")
        pass

    # ===== LEGACY METHODS (NOT USED IN ASR-ONLY MODE) =====
    # These methods are kept for reference but not called in the simplified ASR pipeline
    # They require additional libraries (parselmouth, fastdtw, sklearn) that are not needed for ASR-only mode
    
    def _extract_comprehensive_features(self, audio: np.ndarray, sr: int, audio_path: str) -> Dict[str, Any]:
        """Extract multi-modal acoustic features"""
        features = {}
        
        # MFCC (20 coefficients)
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20, hop_length=512)
        features['mfcc'] = mfcc.T  # Transpose for time x features
        
        # Zero-Crossing Rate
        zcr = librosa.feature.zero_crossing_rate(audio, hop_length=512)[0]
        features['zcr'] = zcr
        
        # RMS Energy
        rms_energy = librosa.feature.rms(y=audio, hop_length=512)[0]
        features['rms_energy'] = rms_energy
        
        # Spectral Flux
        stft = librosa.stft(audio, hop_length=512)
        magnitude = np.abs(stft)
        spectral_flux = np.sum(np.diff(magnitude, axis=1) * (np.diff(magnitude, axis=1) > 0), axis=0)
        features['spectral_flux'] = spectral_flux
        
        # Energy Entropy
        frame_energy = np.sum(magnitude ** 2, axis=0)
        frame_energy = frame_energy + 1e-10  # Avoid log(0)
        energy_entropy = -np.sum((magnitude ** 2 / frame_energy) * np.log(magnitude ** 2 / frame_energy + 1e-10), axis=0)
        features['energy_entropy'] = energy_entropy
        
        # Formant Analysis using Parselmouth
        try:
            sound = parselmouth.Sound(audio_path)
            formant = sound.to_formant_burg(time_step=0.01)
            times = np.arange(0, sound.duration, 0.01)
            f1, f2, f3, f4 = [], [], [], []
            
            for t in times:
                try:
                    f1.append(formant.get_value_at_time(1, t) if formant.get_value_at_time(1, t) > 0 else np.nan)
                    f2.append(formant.get_value_at_time(2, t) if formant.get_value_at_time(2, t) > 0 else np.nan)
                    f3.append(formant.get_value_at_time(3, t) if formant.get_value_at_time(3, t) > 0 else np.nan)
                    f4.append(formant.get_value_at_time(4, t) if formant.get_value_at_time(4, t) > 0 else np.nan)
                except:
                    f1.append(np.nan)
                    f2.append(np.nan)
                    f3.append(np.nan)
                    f4.append(np.nan)
            
            formants = np.array([f1, f2, f3, f4]).T
            features['formants'] = formants
            
            # Calculate vowel space area (F1-F2 plane)
            valid_f1f2 = formants[~np.isnan(formants[:, 0]) & ~np.isnan(formants[:, 1]), :2]
            if len(valid_f1f2) > 0:
                # Convex hull area approximation
                try:
                    hull = ConvexHull(valid_f1f2)
                    vowel_space_area = hull.volume
                except:
                    vowel_space_area = np.nan
            else:
                vowel_space_area = np.nan
            
            features['formant_summary'] = {
                'vowel_space_area': float(vowel_space_area) if not np.isnan(vowel_space_area) else 0.0,
                'f1_mean': float(np.nanmean(f1)) if len(f1) > 0 else 0.0,
                'f2_mean': float(np.nanmean(f2)) if len(f2) > 0 else 0.0,
                'f1_std': float(np.nanstd(f1)) if len(f1) > 0 else 0.0,
                'f2_std': float(np.nanstd(f2)) if len(f2) > 0 else 0.0
            }
        except Exception as e:
            logger.warning(f"Formant analysis failed: {e}")
            features['formants'] = np.zeros((len(audio) // 100, 4))
            features['formant_summary'] = {
                'vowel_space_area': 0.0,
                'f1_mean': 0.0, 'f2_mean': 0.0,
                'f1_std': 0.0, 'f2_std': 0.0
            }
        
        # Voice Quality Metrics (Jitter, Shimmer, HNR)
        try:
            sound = parselmouth.Sound(audio_path)
            pitch = sound.to_pitch()
            point_process = parselmouth.praat.call([sound, pitch], "To PointProcess")
            
            jitter = parselmouth.praat.call(point_process, "Get jitter (local)", 0.0, 0.0, 1.1, 1.6, 1.3, 1.6)
            shimmer = parselmouth.praat.call([sound, point_process], "Get shimmer (local)", 0.0, 0.0, 0.0001, 0.02, 1.3, 1.6)
            hnr = parselmouth.praat.call(sound, "Get harmonicity (cc)", 0.0, 0.0, 0.01, 1.5, 1.0, 0.1, 1.0)
            
            features['voice_quality'] = {
                'jitter': float(jitter) if jitter is not None else 0.0,
                'shimmer': float(shimmer) if shimmer is not None else 0.0,
                'hnr_db': float(hnr) if hnr is not None else 20.0
            }
        except Exception as e:
            logger.warning(f"Voice quality analysis failed: {e}")
            features['voice_quality'] = {
                'jitter': 0.0,
                'shimmer': 0.0,
                'hnr_db': 20.0
            }
        
        return features

    def _transcribe_with_timestamps(self, audio: np.ndarray) -> Tuple[str, List[Dict], torch.Tensor]:
        """
        Transcribe audio and return word timestamps and logits.
        
        Uses the feature extractor for clean separation of concerns.
        """
        try:
            # Use feature extractor for transcription (clean architecture)
            features = self.feature_extractor.get_transcription_features(audio, sample_rate=16000)
            transcript = features['transcript']
            logits = torch.from_numpy(features['logits'])
            
            # Get word-level features for timestamps
            word_features = self.feature_extractor.get_word_level_features(audio, sample_rate=16000)
            word_timestamps = word_features['word_timestamps']
            
            logger.info(f"📝 Transcription via feature extractor: '{transcript}' (length: {len(transcript)}, words: {len(word_timestamps)})")
            
            return transcript, word_timestamps, logits
        except Exception as e:
            logger.error(f"❌ Transcription failed: {e}", exc_info=True)
            return "", [], torch.zeros((1, 100, 32))  # Dummy return

    def _calculate_uncertainty(self, logits: torch.Tensor) -> Tuple[float, List[Dict]]:
        """Calculate entropy-based uncertainty and low-confidence regions"""
        try:
            probs = torch.softmax(logits, dim=-1)
            entropy = -torch.sum(probs * torch.log(probs + 1e-10), dim=-1)
            entropy_mean = float(torch.mean(entropy).item())
            
            # Find low-confidence regions
            frame_duration = 0.02
            low_conf_regions = []
            confidence = torch.max(probs, dim=-1)[0]
            
            for i in range(confidence.shape[1]):
                conf = float(confidence[0, i].item())
                if conf < CONFIDENCE_LOW_THRESHOLD:
                    low_conf_regions.append({
                        'time': i * frame_duration,
                        'confidence': conf
                    })
            
            return entropy_mean, low_conf_regions
        except Exception as e:
            logger.warning(f"Uncertainty calculation failed: {e}")
            return 0.0, []

    def _estimate_speaking_rate(self, audio: np.ndarray, sr: int) -> float:
        """Estimate speaking rate in syllables per second"""
        try:
            # Simple syllable estimation using energy peaks
            rms = librosa.feature.rms(y=audio, hop_length=512)[0]
            peaks, _ = librosa.util.peak_pick(rms, pre_max=3, post_max=3, pre_avg=3, post_avg=5, delta=0.1, wait=10)
            
            duration = len(audio) / sr
            num_syllables = len(peaks)
            speaking_rate = num_syllables / duration if duration > 0 else SPEECH_RATE_TYPICAL
            
            return max(SPEECH_RATE_MIN, min(SPEECH_RATE_MAX, speaking_rate))
        except Exception as e:
            logger.warning(f"Speaking rate estimation failed: {e}")
            return SPEECH_RATE_TYPICAL

    def _detect_prolongations_advanced(self, mfcc: np.ndarray, spectral_flux: np.ndarray, 
                                      speaking_rate: float, word_timestamps: List[Dict]) -> List[StutterEvent]:
        """Detect prolongations using spectral correlation"""
        events = []
        frame_duration = 0.02
        
        # Adaptive threshold based on speaking rate
        min_duration = PROLONGATION_MIN_DURATION * (SPEECH_RATE_TYPICAL / max(speaking_rate, 0.1))
        
        window_size = int(min_duration / frame_duration)
        if window_size < 2:
            return events
        
        for i in range(len(mfcc) - window_size):
            window = mfcc[i:i+window_size]
            
            # Calculate spectral correlation
            if len(window) > 1:
                corr_matrix = np.corrcoef(window.T)
                avg_correlation = np.mean(corr_matrix[np.triu_indices_from(corr_matrix, k=1)])
                
                if avg_correlation > PROLONGATION_CORRELATION_THRESHOLD:
                    start_time = i * frame_duration
                    end_time = (i + window_size) * frame_duration
                    
                    # Check if within a word boundary
                    for word_ts in word_timestamps:
                        if word_ts['start'] <= start_time <= word_ts['end']:
                            events.append(StutterEvent(
                                type='prolongation',
                                start=start_time,
                                end=end_time,
                                text=word_ts.get('word', ''),
                                confidence=float(avg_correlation),
                                acoustic_features={
                                    'spectral_correlation': float(avg_correlation),
                                    'duration': end_time - start_time
                                }
                            ))
                            break
        
        return events

    def _detect_blocks_enhanced(self, audio: np.ndarray, sr: int, rms_energy: np.ndarray,
                               zcr: np.ndarray, word_timestamps: List[Dict], 
                               speaking_rate: float) -> List[StutterEvent]:
        """Detect blocks using silence analysis"""
        events = []
        frame_duration = 0.02
        
        # Adaptive threshold
        silence_threshold = BLOCK_SILENCE_THRESHOLD * (SPEECH_RATE_TYPICAL / max(speaking_rate, 0.1))
        energy_threshold = np.percentile(rms_energy, BLOCK_ENERGY_PERCENTILE)
        
        in_silence = False
        silence_start = 0
        
        for i, energy in enumerate(rms_energy):
            is_silent = energy < energy_threshold and zcr[i] < ZCR_VOICED_THRESHOLD
            
            if is_silent and not in_silence:
                silence_start = i * frame_duration
                in_silence = True
            elif not is_silent and in_silence:
                silence_duration = (i * frame_duration) - silence_start
                if silence_duration > silence_threshold:
                    # Check if mid-utterance (not at start/end)
                    audio_duration = len(audio) / sr
                    if silence_start > 0.1 and silence_start < audio_duration - 0.1:
                        events.append(StutterEvent(
                            type='block',
                            start=silence_start,
                            end=i * frame_duration,
                            text="<silence>",
                            confidence=0.8,
                            acoustic_features={
                                'silence_duration': silence_duration,
                                'energy_level': float(energy)
                            }
                        ))
                in_silence = False
        
        return events

    def _detect_repetitions_advanced(self, mfcc: np.ndarray, formants: np.ndarray,
                                    word_timestamps: List[Dict], transcript: str,
                                    speaking_rate: float) -> List[StutterEvent]:
        """Detect repetitions using DTW and text matching"""
        events = []
        
        if len(word_timestamps) < 2:
            return events
        
        # Text-based repetition detection
        words = transcript.lower().split()
        for i in range(len(words) - 1):
            if words[i] == words[i+1]:
                # Find corresponding timestamps
                if i < len(word_timestamps) and i+1 < len(word_timestamps):
                    start = word_timestamps[i]['start']
                    end = word_timestamps[i+1]['end']
                    
                    # DTW verification on MFCC
                    start_frame = int(start / 0.02)
                    mid_frame = int((start + end) / 2 / 0.02)
                    end_frame = int(end / 0.02)
                    
                    if start_frame < len(mfcc) and end_frame < len(mfcc):
                        segment1 = mfcc[start_frame:mid_frame]
                        segment2 = mfcc[mid_frame:end_frame]
                        
                        if len(segment1) > 0 and len(segment2) > 0:
                            try:
                                distance, _ = fastdtw(segment1, segment2)
                                normalized_distance = distance / max(len(segment1), len(segment2))
                                
                                if normalized_distance < REPETITION_DTW_THRESHOLD:
                                    events.append(StutterEvent(
                                        type='repetition',
                                        start=start,
                                        end=end,
                                        text=words[i],
                                        confidence=1.0 - normalized_distance,
                                        acoustic_features={
                                            'dtw_distance': float(normalized_distance),
                                            'repetition_count': 2
                                        }
                                    ))
                            except:
                                pass
        
        return events

    def _detect_voice_quality_issues(self, audio_path: str, word_timestamps: List[Dict],
                                    voice_quality: Dict[str, float]) -> List[StutterEvent]:
        """Detect dysfluencies based on voice quality metrics"""
        events = []
        
        # Global voice quality issues
        if voice_quality.get('jitter', 0) > JITTER_THRESHOLD or \
           voice_quality.get('shimmer', 0) > SHIMMER_THRESHOLD or \
           voice_quality.get('hnr_db', 20) < HNR_THRESHOLD:
            
            # Mark regions with poor voice quality
            for word_ts in word_timestamps:
                if word_ts.get('start', 0) > 0:  # Skip first word
                    events.append(StutterEvent(
                        type='dysfluency',
                        start=word_ts['start'],
                        end=word_ts['end'],
                        text=word_ts.get('word', ''),
                        confidence=0.6,
                        voice_quality=voice_quality.copy()
                    ))
                    break  # Only mark first occurrence
        
        return events

    def _is_overlapping(self, time: float, events: List[StutterEvent], threshold: float = 0.1) -> bool:
        """Check if time overlaps with existing events"""
        for event in events:
            if event.start - threshold <= time <= event.end + threshold:
                return True
        return False

    def _detect_anomalies(self, events: List[StutterEvent], features: Dict[str, Any]) -> List[StutterEvent]:
        """Use Isolation Forest to filter anomalous events"""
        if len(events) == 0:
            return events
        
        try:
            # Extract features for anomaly detection
            X = []
            for event in events:
                feat_vec = [
                    event.end - event.start,  # Duration
                    event.confidence,
                    features.get('voice_quality', {}).get('jitter', 0),
                    features.get('voice_quality', {}).get('shimmer', 0)
                ]
                X.append(feat_vec)
            
            X = np.array(X)
            if len(X) > 1:
                self.anomaly_detector.fit(X)
                predictions = self.anomaly_detector.predict(X)
                
                # Keep only non-anomalous events (predictions == 1)
                filtered_events = [events[i] for i, pred in enumerate(predictions) if pred == 1]
                return filtered_events
        except Exception as e:
            logger.warning(f"Anomaly detection failed: {e}")
        
        return events

    def _deduplicate_events_cascade(self, events: List[StutterEvent]) -> List[StutterEvent]:
        """Remove overlapping events with priority: Block > Repetition > Prolongation > Dysfluency"""
        if len(events) == 0:
            return events
        
        # Sort by priority and start time
        priority = {'block': 4, 'repetition': 3, 'prolongation': 2, 'dysfluency': 1}
        events.sort(key=lambda e: (priority.get(e.type, 0), e.start), reverse=True)
        
        cleaned = []
        for event in events:
            overlap = False
            for existing in cleaned:
                # Check overlap
                if not (event.end < existing.start or event.start > existing.end):
                    overlap = True
                    break
            
            if not overlap:
                cleaned.append(event)
        
        # Sort by start time
        cleaned.sort(key=lambda e: e.start)
        return cleaned

    def _calculate_clinical_metrics(self, events: List[StutterEvent], duration: float,
                                    speaking_rate: float, features: Dict[str, Any]) -> Dict[str, Any]:
        """Calculate comprehensive clinical metrics"""
        total_duration = sum(e.end - e.start for e in events)
        frequency = (len(events) / duration * 60) if duration > 0 else 0
        
        # Calculate severity score (0-100)
        stutter_percentage = (total_duration / duration * 100) if duration > 0 else 0
        frequency_score = min(frequency / 10 * 100, 100)  # Normalize to 100
        severity_score = (stutter_percentage * 0.6 + frequency_score * 0.4)
        
        # Determine severity label
        if severity_score < 10:
            severity_label = 'none'
        elif severity_score < 25:
            severity_label = 'mild'
        elif severity_score < 50:
            severity_label = 'moderate'
        else:
            severity_label = 'severe'
        
        # Calculate confidence based on multiple factors
        voice_quality = features.get('voice_quality', {})
        confidence = 0.8  # Base confidence
        
        # Adjust based on voice quality metrics
        if voice_quality.get('jitter', 0) > JITTER_THRESHOLD:
            confidence -= 0.1
        if voice_quality.get('shimmer', 0) > SHIMMER_THRESHOLD:
            confidence -= 0.1
        if voice_quality.get('hnr_db', 20) < HNR_THRESHOLD:
            confidence -= 0.1
        
        confidence = max(0.3, min(1.0, confidence))
        
        return {
            'total_duration': round(total_duration, 2),
            'frequency': round(frequency, 2),
            'severity_score': round(severity_score, 2),
            'severity_label': severity_label,
            'confidence': round(confidence, 2)
        }

    def _event_to_dict(self, event: StutterEvent) -> Dict[str, Any]:
        """Convert StutterEvent to dictionary"""
        return {
            'type': event.type,
            'start': round(event.start, 2),
            'end': round(event.end, 2),
            'text': event.text,
            'confidence': round(event.confidence, 2),
            'acoustic_features': event.acoustic_features,
            'voice_quality': event.voice_quality,
            'formant_data': event.formant_data,
            'phonetic_similarity': round(event.phonetic_similarity, 2)
        }
    
    # ========== ADVANCED TRANSCRIPT COMPARISON METHODS ==========
    
    def _get_phonetic_group(self, char: str) -> Optional[str]:
        """Get phonetic group for a Devanagari character"""
        for group_name, chars in DEVANAGARI_CONSONANT_GROUPS.items():
            if char in chars:
                return f'consonant_{group_name}'
        for group_name, chars in DEVANAGARI_VOWEL_GROUPS.items():
            if char in chars:
                return f'vowel_{group_name}'
        return None
    
    def _calculate_phonetic_similarity(self, char1: str, char2: str) -> float:
        """
        Calculate phonetic similarity between two characters (0-1)
        Based on articulatory phonetics research
        """
        if char1 == char2:
            return 1.0
        
        # Get phonetic groups
        group1 = self._get_phonetic_group(char1)
        group2 = self._get_phonetic_group(char2)
        
        if group1 is None or group2 is None:
            # Non-Devanagari characters - use simple comparison
            return 1.0 if char1.lower() == char2.lower() else 0.0
        
        # Same phonetic group = high similarity (common in stuttering)
        if group1 == group2:
            return 0.85  # e.g., क vs ख (both velar)
        
        # Same major category (both consonants or both vowels)
        if group1.split('_')[0] == group2.split('_')[0]:
            return 0.5  # e.g., क (velar) vs च (palatal)
        
        # Different categories
        return 0.2
    
    def _longest_common_subsequence(self, text1: str, text2: str) -> str:
        """
        Find longest common subsequence (LCS) using dynamic programming
        Critical for identifying core message vs stuttered additions
        """
        m, n = len(text1), len(text2)
        dp = [[0] * (n + 1) for _ in range(m + 1)]
        
        # Build DP table
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if text1[i-1] == text2[j-1]:
                    dp[i][j] = dp[i-1][j-1] + 1
                else:
                    dp[i][j] = max(dp[i-1][j], dp[i][j-1])
        
        # Backtrack to construct LCS
        lcs = []
        i, j = m, n
        while i > 0 and j > 0:
            if text1[i-1] == text2[j-1]:
                lcs.append(text1[i-1])
                i -= 1
                j -= 1
            elif dp[i-1][j] > dp[i][j-1]:
                i -= 1
            else:
                j -= 1
        
        return ''.join(reversed(lcs))
    
    def _calculate_edit_distance(self, text1: str, text2: str, phonetic_aware: bool = True) -> Tuple[int, List[Dict]]:
        """
        Calculate Levenshtein edit distance with phonetic awareness
        Returns: (distance, list of edit operations)
        """
        m, n = len(text1), len(text2)
        dp = [[0] * (n + 1) for _ in range(m + 1)]
        ops = [[[] for _ in range(n + 1)] for _ in range(m + 1)]
        
        # Initialize
        for i in range(m + 1):
            dp[i][0] = i
            if i > 0:
                ops[i][0] = ops[i-1][0] + [{'op': 'delete', 'pos': i-1, 'char': text1[i-1]}]
        for j in range(n + 1):
            dp[0][j] = j
            if j > 0:
                ops[0][j] = ops[0][j-1] + [{'op': 'insert', 'pos': j-1, 'char': text2[j-1]}]
        
        # Fill DP table with phonetic costs
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if text1[i-1] == text2[j-1]:
                    # Exact match - no cost
                    dp[i][j] = dp[i-1][j-1]
                    ops[i][j] = ops[i-1][j-1]
                else:
                    # Calculate phonetic substitution cost
                    if phonetic_aware:
                        phon_sim = self._calculate_phonetic_similarity(text1[i-1], text2[j-1])
                        sub_cost = 1.0 - (phon_sim * 0.5)  # 0.5-1.0 range
                    else:
                        sub_cost = 1.0
                    
                    # Choose minimum cost operation
                    costs = [
                        dp[i-1][j] + 1,  # Delete
                        dp[i][j-1] + 1,  # Insert
                        dp[i-1][j-1] + sub_cost  # Substitute
                    ]
                    min_cost_idx = costs.index(min(costs))
                    dp[i][j] = costs[min_cost_idx]
                    
                    if min_cost_idx == 0:
                        ops[i][j] = ops[i-1][j] + [{'op': 'delete', 'pos': i-1, 'char': text1[i-1]}]
                    elif min_cost_idx == 1:
                        ops[i][j] = ops[i][j-1] + [{'op': 'insert', 'pos': j-1, 'char': text2[j-1]}]
                    else:
                        ops[i][j] = ops[i-1][j-1] + [{'op': 'substitute', 'pos': i-1, 
                                                      'from': text1[i-1], 'to': text2[j-1],
                                                      'phonetic_sim': phon_sim if phonetic_aware else 0}]
        
        return int(dp[m][n]), ops[m][n]
    
    def _find_mismatched_segments(self, actual: str, target: str) -> List[str]:
        """
        Find character sequences in actual that don't appear in target
        Uses LCS to identify core message, then extracts mismatches
        """
        if not actual or not target:
            return [actual] if actual else []
        
        lcs = self._longest_common_subsequence(actual, target)
        
        # Extract segments not in LCS
        mismatched_segments = []
        segment = ""
        lcs_idx = 0
        
        for char in actual:
            if lcs_idx < len(lcs) and char == lcs[lcs_idx]:
                if segment:
                    mismatched_segments.append(segment)
                    segment = ""
                lcs_idx += 1
            else:
                segment += char
        
        if segment:
            mismatched_segments.append(segment)
        
        return mismatched_segments
    
    def _detect_stutter_patterns_in_text(self, text: str) -> List[Dict[str, Any]]:
        """
        Detect common Hindi stutter patterns in text
        Based on linguistic research on Hindi dysfluencies
        """
        patterns_found = []
        
        # Detect repetitions
        for pattern in HINDI_STUTTER_PATTERNS['repetition']:
            matches = re.finditer(pattern, text)
            for match in matches:
                patterns_found.append({
                    'type': 'repetition',
                    'text': match.group(0),
                    'position': match.start(),
                    'pattern': pattern
                })
        
        # Detect prolongations
        for pattern in HINDI_STUTTER_PATTERNS['prolongation']:
            matches = re.finditer(pattern, text)
            for match in matches:
                patterns_found.append({
                    'type': 'prolongation',
                    'text': match.group(0),
                    'position': match.start(),
                    'pattern': pattern
                })
        
        # Detect filled pauses
        words = text.split()
        for i, word in enumerate(words):
            if word in HINDI_STUTTER_PATTERNS['filled_pause']:
                patterns_found.append({
                    'type': 'filled_pause',
                    'text': word,
                    'position': i,
                    'pattern': 'hesitation'
                })
        
        return patterns_found
    
    def _compare_transcripts_comprehensive(self, actual: str, target: str) -> Dict[str, Any]:
        """
        Comprehensive transcript comparison with multiple metrics
        Returns detailed analysis including phonetic, structural, and acoustic mismatches
        """
        if not target:
            # No target provided - only analyze actual for stutter patterns
            stutter_patterns = self._detect_stutter_patterns_in_text(actual)
            return {
                'has_target': False,
                'mismatched_chars': [],
                'mismatch_percentage': 0,
                'edit_distance': 0,
                'lcs_ratio': 1.0,
                'phonetic_similarity': 1.0,
                'stutter_patterns': stutter_patterns,
                'edit_operations': []
            }
        
        # Normalize whitespace
        actual = ' '.join(actual.split())
        target = ' '.join(target.split())
        
        # 1. Find mismatched character segments
        mismatched_segments = self._find_mismatched_segments(actual, target)
        
        # 2. Calculate edit distance with phonetic awareness
        edit_dist, edit_ops = self._calculate_edit_distance(actual, target, phonetic_aware=True)
        
        # 3. Calculate LCS ratio (similarity measure)
        lcs = self._longest_common_subsequence(actual, target)
        lcs_ratio = len(lcs) / max(len(target), 1)
        
        # 4. Calculate overall phonetic similarity
        phonetic_scores = []
        matcher = SequenceMatcher(None, actual, target)
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            if tag == 'equal':
                phonetic_scores.append(1.0)
            elif tag == 'replace':
                # Calculate phonetic similarity for replacements
                for a_char, t_char in zip(actual[i1:i2], target[j1:j2]):
                    phonetic_scores.append(self._calculate_phonetic_similarity(a_char, t_char))
        
        avg_phonetic_sim = np.mean(phonetic_scores) if phonetic_scores else 0.0
        
        # 5. Calculate mismatch percentage (characters not in target)
        total_mismatched = sum(len(seg) for seg in mismatched_segments)
        mismatch_percentage = (total_mismatched / max(len(target), 1)) * 100
        mismatch_percentage = min(round(mismatch_percentage), 100)
        
        # 6. Detect stutter patterns in actual transcript
        stutter_patterns = self._detect_stutter_patterns_in_text(actual)
        
        # 7. Word-level analysis
        actual_words = actual.split()
        target_words = target.split()
        word_matcher = SequenceMatcher(None, actual_words, target_words)
        word_accuracy = word_matcher.ratio()
        
        return {
            'has_target': True,
            'mismatched_chars': mismatched_segments,
            'mismatch_percentage': mismatch_percentage,
            'edit_distance': edit_dist,
            'normalized_edit_distance': edit_dist / max(len(target), 1),
            'lcs': lcs,
            'lcs_ratio': round(lcs_ratio, 3),
            'phonetic_similarity': round(float(avg_phonetic_sim), 3),
            'word_accuracy': round(word_accuracy, 3),
            'stutter_patterns': stutter_patterns,
            'edit_operations': edit_ops[:20],  # Limit for performance
            'actual_length': len(actual),
            'target_length': len(target),
            'actual_words': len(actual_words),
            'target_words': len(target_words)
        }
    
    # ========== ACOUSTIC SIMILARITY METHODS (SOUND-BASED MATCHING) ==========
    
    def _extract_mfcc_features(self, audio: np.ndarray, sr: int, n_mfcc: int = 13) -> np.ndarray:
        """Extract MFCC features for acoustic comparison"""
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc, hop_length=512)
        # Normalize
        mfcc = (mfcc - np.mean(mfcc, axis=1, keepdims=True)) / (np.std(mfcc, axis=1, keepdims=True) + 1e-8)
        return mfcc.T  # Time x Features
    
    def _calculate_dtw_distance(self, seq1: np.ndarray, seq2: np.ndarray) -> float:
        """
        Dynamic Time Warping distance for comparing audio segments
        Critical for detecting phonetic stutters where timing differs
        """
        n, m = len(seq1), len(seq2)
        dtw_matrix = np.full((n + 1, m + 1), np.inf)
        dtw_matrix[0, 0] = 0
        
        for i in range(1, n + 1):
            for j in range(1, m + 1):
                cost = euclidean(seq1[i-1], seq2[j-1])
                dtw_matrix[i, j] = cost + min(
                    dtw_matrix[i-1, j],      # Insertion
                    dtw_matrix[i, j-1],      # Deletion
                    dtw_matrix[i-1, j-1]     # Match
                )
        
        # Normalize by path length
        return dtw_matrix[n, m] / (n + m)
    
    def _compare_audio_segments_acoustic(self, segment1: np.ndarray, segment2: np.ndarray, 
                                        sr: int = 16000) -> Dict[str, float]:
        """
        Compare two audio segments acoustically using multiple metrics
        Used to detect when sounds are similar but transcripts differ (phonetic stutters)
        """
        # Extract MFCC features
        mfcc1 = self._extract_mfcc_features(segment1, sr)
        mfcc2 = self._extract_mfcc_features(segment2, sr)
        
        # 1. DTW distance
        dtw_dist = self._calculate_dtw_distance(mfcc1, mfcc2)
        dtw_similarity = max(0, 1.0 - (dtw_dist / 10))  # Normalize to 0-1
        
        # 2. Spectral features comparison
        spec1 = np.abs(librosa.stft(segment1))
        spec2 = np.abs(librosa.stft(segment2))
        
        # Resize to same shape for comparison
        min_frames = min(spec1.shape[1], spec2.shape[1])
        spec1 = spec1[:, :min_frames]
        spec2 = spec2[:, :min_frames]
        
        # Spectral correlation
        spec_corr = np.mean([pearsonr(spec1[:, i], spec2[:, i])[0] 
                            for i in range(min_frames) if not np.all(spec1[:, i] == 0) 
                            and not np.all(spec2[:, i] == 0)])
        spec_corr = max(0, spec_corr)  # Handle NaN/negative
        
        # 3. Energy comparison
        energy1 = np.sum(segment1 ** 2)
        energy2 = np.sum(segment2 ** 2)
        energy_ratio = min(energy1, energy2) / (max(energy1, energy2) + 1e-8)
        
        # 4. Zero-crossing rate comparison
        zcr1 = np.mean(librosa.feature.zero_crossing_rate(segment1)[0])
        zcr2 = np.mean(librosa.feature.zero_crossing_rate(segment2)[0])
        zcr_similarity = 1.0 - min(abs(zcr1 - zcr2) / (max(zcr1, zcr2) + 1e-8), 1.0)
        
        # Overall acoustic similarity (weighted average)
        overall_similarity = (
            dtw_similarity * 0.4 +
            spec_corr * 0.3 +
            energy_ratio * 0.15 +
            zcr_similarity * 0.15
        )
        
        return {
            'dtw_similarity': round(float(dtw_similarity), 3),
            'spectral_correlation': round(float(spec_corr), 3),
            'energy_ratio': round(float(energy_ratio), 3),
            'zcr_similarity': round(float(zcr_similarity), 3),
            'overall_acoustic_similarity': round(float(overall_similarity), 3)
        }
    
    def _detect_acoustic_repetitions(self, audio: np.ndarray, sr: int, 
                                    word_timestamps: List[Dict]) -> List[StutterEvent]:
        """
        Detect repetitions by comparing acoustic similarity between word segments
        Catches stutters even when ASR transcribes them differently
        """
        events = []
        
        if len(word_timestamps) < 2:
            return events
        
        # Compare consecutive words acoustically
        for i in range(len(word_timestamps) - 1):
            try:
                # Extract audio segments
                start1 = int(word_timestamps[i]['start'] * sr)
                end1 = int(word_timestamps[i]['end'] * sr)
                start2 = int(word_timestamps[i+1]['start'] * sr)
                end2 = int(word_timestamps[i+1]['end'] * sr)
                
                if end1 > len(audio) or end2 > len(audio):
                    continue
                
                segment1 = audio[start1:end1]
                segment2 = audio[start2:end2]
                
                if len(segment1) < 100 or len(segment2) < 100:  # Skip very short segments
                    continue
                
                # Calculate acoustic similarity
                acoustic_sim = self._compare_audio_segments_acoustic(segment1, segment2, sr)
                
                # High acoustic similarity suggests repetition (even if transcripts differ)
                if acoustic_sim['overall_acoustic_similarity'] > 0.75:
                    events.append(StutterEvent(
                        type='repetition',
                        start=word_timestamps[i]['start'],
                        end=word_timestamps[i+1]['end'],
                        text=f"{word_timestamps[i].get('word', '')} → {word_timestamps[i+1].get('word', '')}",
                        confidence=acoustic_sim['overall_acoustic_similarity'],
                        acoustic_features=acoustic_sim,
                        phonetic_similarity=acoustic_sim['overall_acoustic_similarity']
                    ))
            except Exception as e:
                logger.warning(f"Acoustic comparison failed for words {i}-{i+1}: {e}")
                continue
        
        return events
    
    def _detect_prolongations_by_sound(self, audio: np.ndarray, sr: int,
                                      word_timestamps: List[Dict]) -> List[StutterEvent]:
        """
        Detect prolongations by analyzing spectral stability within words
        High spectral correlation over time = prolonged sound
        """
        events = []
        
        for word_info in word_timestamps:
            try:
                start = int(word_info['start'] * sr)
                end = int(word_info['end'] * sr)
                
                if end > len(audio) or end - start < sr * 0.3:  # Skip if < 300ms
                    continue
                
                segment = audio[start:end]
                
                # Extract MFCC
                mfcc = self._extract_mfcc_features(segment, sr)
                
                if len(mfcc) < 10:  # Need sufficient frames
                    continue
                
                # Calculate frame-to-frame correlation
                correlations = []
                window_size = 5
                for i in range(len(mfcc) - window_size):
                    corr_matrix = np.corrcoef(mfcc[i:i+window_size].T)
                    avg_corr = np.mean(corr_matrix[np.triu_indices_from(corr_matrix, k=1)])
                    correlations.append(avg_corr)
                
                avg_correlation = np.mean(correlations) if correlations else 0
                
                # High correlation = prolongation (same sound repeated)
                if avg_correlation > PROLONGATION_CORRELATION_THRESHOLD:
                    duration = (end - start) / sr
                    events.append(StutterEvent(
                        type='prolongation',
                        start=word_info['start'],
                        end=word_info['end'],
                        text=word_info.get('word', ''),
                        confidence=float(avg_correlation),
                        acoustic_features={
                            'spectral_correlation': float(avg_correlation),
                            'duration': duration
                        },
                        phonetic_similarity=float(avg_correlation)
                    ))
            except Exception as e:
                logger.warning(f"Prolongation detection failed for word: {e}")
                continue
        
        return events
    
    
    def analyze_audio(self, audio_path: str, proper_transcript: str = "", language: str = 'hindi') -> dict:
        """
        🎯 ADVANCED Multi-Modal Stutter Detection Pipeline
        
        Combines:
        1. ASR Transcription (IndicWav2Vec Hindi)
        2. Phonetic-Aware Transcript Comparison
        3. Acoustic Similarity Matching (Sound-Based)
        4. Linguistic Pattern Detection
        
        This detects stutters that ASR might miss by comparing:
        - What was said (actual) vs what should be said (target)
        - How it sounds (acoustic features)
        - Common Hindi stutter patterns
        """
        start_time = time.time()
        logger.info(f"🚀 Starting advanced analysis: {audio_path}")

        # === STEP 1: Audio Loading & Preprocessing ===
        audio, sr = librosa.load(audio_path, sr=16000)
        duration = librosa.get_duration(y=audio, sr=sr)
        logger.info(f"🎵 Audio loaded: {duration:.2f}s duration")

        # === STEP 2: ASR Transcription using IndicWav2Vec Hindi ===
        transcript, word_timestamps, logits = self._transcribe_with_timestamps(audio)
        logger.info(f"📝 ASR Transcription: '{transcript}' ({len(transcript)} chars, {len(word_timestamps)} words)")
        
        # === STEP 3: Comprehensive Transcript Comparison ===
        comparison_result = self._compare_transcripts_comprehensive(transcript, proper_transcript)
        logger.info(f"🔍 Transcript comparison: {comparison_result['mismatch_percentage']}% mismatch, "
                   f"phonetic similarity: {comparison_result['phonetic_similarity']:.2f}")
        
        # === STEP 4: Multi-Modal Stutter Detection ===
        events = []
        
        # 4a. Text-based stutters from transcript comparison
        if comparison_result['has_target'] and comparison_result['mismatched_chars']:
            for i, segment in enumerate(comparison_result['mismatched_chars'][:10]):  # Limit to top 10
                events.append(StutterEvent(
                    type='mismatch',
                    start=i * 0.5,  # Approximate timing
                    end=(i + 1) * 0.5,
                    text=segment,
                    confidence=0.8,
                    acoustic_features={'source': 'transcript_comparison'},
                    phonetic_similarity=comparison_result['phonetic_similarity']
                ))
        
        # 4b. Detected linguistic patterns (repetitions, prolongations, filled pauses)
        for pattern in comparison_result.get('stutter_patterns', []):
            events.append(StutterEvent(
                type=pattern['type'],
                start=pattern.get('position', 0) * 0.5,
                end=(pattern.get('position', 0) + 1) * 0.5,
                text=pattern['text'],
                confidence=0.75,
                acoustic_features={'pattern': pattern['pattern']}
            ))
        
        # 4c. Acoustic-based detection (sound similarity)
        logger.info("🎤 Running acoustic similarity analysis...")
        acoustic_repetitions = self._detect_acoustic_repetitions(audio, sr, word_timestamps)
        events.extend(acoustic_repetitions)
        logger.info(f"✅ Found {len(acoustic_repetitions)} acoustic repetitions")
        
        acoustic_prolongations = self._detect_prolongations_by_sound(audio, sr, word_timestamps)
        events.extend(acoustic_prolongations)
        logger.info(f"✅ Found {len(acoustic_prolongations)} acoustic prolongations")
        
        # 4d. Model uncertainty regions (low confidence)
        entropy_score, low_conf_regions = self._calculate_uncertainty(logits)
        for region in low_conf_regions[:5]:  # Limit to 5 most uncertain
            events.append(StutterEvent(
                type='dysfluency',
                start=region['time'],
                end=region['time'] + 0.3,
                text="<low_confidence>",
                confidence=region['confidence'],
                acoustic_features={'entropy': entropy_score, 'model_uncertainty': True}
            ))
        
        # === STEP 5: Deduplicate and Rank Events ===
        # Remove overlapping events, keeping highest confidence
        events.sort(key=lambda e: (e.start, -e.confidence))
        deduplicated_events = []
        for event in events:
            # Check if overlaps with existing events
            overlaps = False
            for existing in deduplicated_events:
                if not (event.end < existing.start or event.start > existing.end):
                    overlaps = True
                    break
            if not overlaps:
                deduplicated_events.append(event)
        
        events = deduplicated_events
        logger.info(f"📊 Total events after deduplication: {len(events)}")
        
        # === STEP 6: Calculate Comprehensive Metrics ===
        total_duration = sum(e.end - e.start for e in events)
        frequency = (len(events) / duration * 60) if duration > 0 else 0
        
        # Mismatch percentage from transcript comparison (more accurate)
        mismatch_percentage = comparison_result['mismatch_percentage']
        
        # Severity assessment (multi-factor)
        severity_score = (
            mismatch_percentage * 0.4 +
            (total_duration / duration * 100) * 0.3 +
            (frequency / 10 * 100) * 0.3
        ) if duration > 0 else 0
        
        if severity_score < 10:
            severity = 'none'
        elif severity_score < 25:
            severity = 'mild'
        elif severity_score < 50:
            severity = 'moderate'
        else:
            severity = 'severe'
        
        # Confidence score (multi-factor)
        model_confidence = 1.0 - (entropy_score / 10.0) if entropy_score > 0 else 0.8
        phonetic_confidence = comparison_result.get('phonetic_similarity', 1.0)
        acoustic_confidence = np.mean([e.confidence for e in events if e.type in ['repetition', 'prolongation']]) if events else 0.7
        
        overall_confidence = (
            model_confidence * 0.4 +
            phonetic_confidence * 0.3 +
            acoustic_confidence * 0.3
        )
        overall_confidence = max(0.0, min(1.0, overall_confidence))

        # === STEP 7: Return Comprehensive Results ===
        actual_transcript = transcript if transcript else ""
        target_transcript = proper_transcript if proper_transcript else ""
        
        analysis_time = time.time() - start_time
        
        result = {
            # Core transcripts
            'actual_transcript': actual_transcript,
            'target_transcript': target_transcript,
            
            # Mismatch analysis
            'mismatched_chars': comparison_result.get('mismatched_chars', []),
            'mismatch_percentage': round(mismatch_percentage, 2),
            
            # Advanced comparison metrics
            'edit_distance': comparison_result.get('edit_distance', 0),
            'lcs_ratio': comparison_result.get('lcs_ratio', 1.0),
            'phonetic_similarity': comparison_result.get('phonetic_similarity', 1.0),
            'word_accuracy': comparison_result.get('word_accuracy', 1.0),
            
            # Model metrics
            'ctc_loss_score': round(entropy_score, 4),
            
            # Stutter events with acoustic features
            'stutter_timestamps': [self._event_to_dict(e) for e in events],
            'total_stutter_duration': round(total_duration, 2),
            'stutter_frequency': round(frequency, 2),
            
            # Assessment
            'severity': severity,
            'severity_score': round(severity_score, 2),
            'confidence_score': round(overall_confidence, 2),
            
            # Speaking metrics
            'speaking_rate_sps': round(len(word_timestamps) / duration if duration > 0 else 0, 2),
            
            # Metadata
            'analysis_duration_seconds': round(analysis_time, 2),
            'model_version': 'indicwav2vec-hindi-advanced-v2',
            'features_used': ['asr', 'phonetic_comparison', 'acoustic_similarity', 'pattern_detection'],
            
            # Debug info
            'debug': {
                'total_events_detected': len(events),
                'acoustic_repetitions': len(acoustic_repetitions),
                'acoustic_prolongations': len(acoustic_prolongations),
                'text_patterns': len(comparison_result.get('stutter_patterns', [])),
                'has_target_transcript': comparison_result['has_target']
            }
        }
        
        logger.info(f"✅ Analysis complete in {analysis_time:.2f}s - Severity: {severity}, "
                   f"Mismatch: {mismatch_percentage}%, Confidence: {overall_confidence:.2f}")
        
        return result
    
    
    # Model loader is now in a separate module: model_loader.py
    # This follows clean architecture principles - separation of concerns
    # Import using: from diagnosis.ai_engine.model_loader import get_stutter_detector