"""
Advanced Speech Recognition Module for Multilingual Audio Intelligence System

This module implements state-of-the-art automatic speech recognition using openai-whisper
with integrated language identification capabilities. Designed for maximum performance 
on CPU-constrained environments while maintaining SOTA accuracy.

Key Features:
- OpenAI Whisper with optimized backend for speed improvement
- Integrated Language Identification (no separate LID module needed)
- VAD-based batching for real-time performance on CPU
- Word-level timestamps for interactive UI synchronization
- Robust error handling and multilingual support
- CPU and GPU optimization paths

Model: openai/whisper-small (optimized for speed/accuracy balance)
Dependencies: openai-whisper, torch, numpy
"""

import os
import logging
import warnings
import numpy as np
import torch
from typing import List, Dict, Optional, Tuple, Union
import tempfile
from dataclasses import dataclass
import time

try:
    import whisper
    WHISPER_AVAILABLE = True
except ImportError:
    WHISPER_AVAILABLE = False
    logging.warning("openai-whisper not available. Install with: pip install openai-whisper")

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


@dataclass
class TranscriptionSegment:
    """
    Data class representing a transcribed speech segment with rich metadata.
    """
    # Segment start time, in seconds from the beginning of the audio.
    start: float
    # Segment end time, in seconds.
    end: float
    # Transcribed text for the segment (whitespace-stripped by the recognizer).
    text: str
    # Language code reported by Whisper for the whole transcription
    # ("unknown" when the backend reports none).
    language: str
    # Confidence of the language identification; the recognizer defaults this
    # to 1.0 because openai-whisper's transcribe() result does not include it.
    language_probability: float
    # Whisper's per-segment probability that the span contains no speech.
    no_speech_probability: float
    # Word-level entries: dicts with "word", "start", "end", "probability".
    words: Optional[List[Dict]] = None
    # Speaker label from diarization ("Unknown"/None when not attributed).
    speaker_id: Optional[str] = None
    # Heuristic confidence, set by the recognizer as 1 - no_speech_probability.
    confidence: Optional[float] = None
    # Duplicate of `words`; kept as a separate field for consumers that
    # expect this name (the recognizer assigns the same list to both).
    word_timestamps: Optional[List[Dict]] = None


class SpeechRecognizer:
    """
    Advanced Speech Recognition Engine using OpenAI Whisper.
    
    This class provides high-performance speech recognition with integrated
    language identification, optimized for both CPU and GPU environments.
    """
    
    # Whisper models operate on 16 kHz mono float32 audio.
    TARGET_SAMPLE_RATE = 16000
    
    def __init__(self, model_size: str = "small", device: str = "auto", 
                 compute_type: str = "int8", language: Optional[str] = None):
        """
        Initialize the Speech Recognizer.
        
        Args:
            model_size: Whisper model size (tiny, base, small, medium, large)
            device: Device to use (auto, cpu, cuda)
            compute_type: Computation precision (int8, float16, float32).
                NOTE(review): not consumed by the openai-whisper backend;
                kept for interface compatibility with existing callers.
            language: Target language code (None for auto-detection)
        
        Raises:
            ImportError: If openai-whisper is not installed.
        """
        self.model_size = model_size
        self.device = self._determine_device(device)
        self.compute_type = compute_type
        self.language = language
        self.model = None
        self._initialize_model()
        
    def _determine_device(self, device: str) -> str:
        """Resolve "auto" to the best available device (cuda > mps > cpu)."""
        if device == "auto":
            if torch.cuda.is_available():
                return "cuda"
            if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
                return "mps"
            return "cpu"
        return device
    
    def _initialize_model(self):
        """Load the Whisper model onto the resolved device.
        
        Raises:
            ImportError: If openai-whisper is not installed.
            Exception: Re-raises any model-loading failure after logging it.
        """
        if not WHISPER_AVAILABLE:
            raise ImportError("openai-whisper is required. Install with: pip install openai-whisper")
        
        try:
            logger.info(f"Loading {self.model_size} Whisper model...")
            self.model = whisper.load_model(self.model_size, device=self.device)
            logger.info(f"Speech recognition models loaded on {self.device}")
        except Exception as e:
            logger.error(f"Failed to load Whisper model: {e}")
            raise
    
    def _prepare_audio(self, audio_data: np.ndarray, sample_rate: int) -> np.ndarray:
        """Resample to 16 kHz if needed and coerce to float32 for Whisper."""
        if sample_rate != self.TARGET_SAMPLE_RATE:
            import librosa
            audio_data = librosa.resample(audio_data, orig_sr=sample_rate,
                                          target_sr=self.TARGET_SAMPLE_RATE)
        # Whisper expects float32 input. librosa.load provides it, but callers
        # handing us raw arrays (e.g. int16 PCM or float64) would otherwise fail.
        if audio_data.dtype != np.float32:
            audio_data = audio_data.astype(np.float32)
        return audio_data
    
    @staticmethod
    def _extract_words(segment: Dict, include: bool = True) -> List[Dict]:
        """Extract word-level timestamp dicts from one raw Whisper segment."""
        if not include or "words" not in segment:
            return []
        return [
            {
                "word": word["word"],
                "start": word["start"],
                "end": word["end"],
                "probability": word.get("probability", 1.0)
            }
            for word in segment["words"]
        ]
    
    @staticmethod
    def _match_speaker(start: float, end: float,
                       speaker_segments: List[Tuple[float, float, str]]) -> str:
        """Return the speaker whose turn overlaps [start, end] the most.
        
        Using maximum overlap (rather than requiring full containment) means
        transcription segments that straddle a diarization boundary are still
        attributed to the dominant speaker instead of falling back to
        "Unknown". Segments overlapping no speaker turn remain "Unknown".
        """
        best_speaker = "Unknown"
        best_overlap = 0.0
        for spk_start, spk_end, spk_id in speaker_segments:
            overlap = min(end, spk_end) - max(start, spk_start)
            if overlap > best_overlap:
                best_overlap = overlap
                best_speaker = spk_id
        return best_speaker
    
    def _build_segment(self, segment: Dict, result: Dict, words: List[Dict],
                       speaker_id: Optional[str]) -> "TranscriptionSegment":
        """Convert one raw Whisper segment dict into a TranscriptionSegment."""
        no_speech = segment.get("no_speech_prob", 0.0)
        return TranscriptionSegment(
            start=segment["start"],
            end=segment["end"],
            text=segment["text"].strip(),
            language=result.get("language", "unknown"),
            # openai-whisper's transcribe() result does not include a language
            # probability; 1.0 is the deliberate fallback.
            language_probability=result.get("language_probability", 1.0),
            no_speech_probability=no_speech,
            words=words,
            speaker_id=speaker_id,
            confidence=1.0 - no_speech,
            word_timestamps=words
        )
    
    def transcribe_audio(self, audio_data: np.ndarray, sample_rate: int = 16000,
                        language: Optional[str] = None, 
                        initial_prompt: Optional[str] = None) -> List["TranscriptionSegment"]:
        """
        Transcribe audio data with language identification.
        
        Args:
            audio_data: Audio data as numpy array (any dtype; coerced to float32)
            sample_rate: Sample rate of the audio (resampled to 16 kHz if needed)
            language: Language code (None for auto-detection)
            initial_prompt: Initial prompt for better transcription
            
        Returns:
            List of TranscriptionSegment objects (speaker_id is None here)
        
        Raises:
            RuntimeError: If the model is not initialized.
            Exception: Re-raises any transcription failure after logging it.
        """
        if self.model is None:
            raise RuntimeError("Model not initialized")
        
        try:
            audio_data = self._prepare_audio(audio_data, sample_rate)
            
            result = self.model.transcribe(
                audio_data,
                language=language or self.language,
                initial_prompt=initial_prompt,
                word_timestamps=True,
                verbose=False
            )
            
            return [
                self._build_segment(seg, result, self._extract_words(seg), None)
                for seg in result["segments"]
            ]
            
        except Exception as e:
            logger.error(f"Transcription failed: {e}")
            raise
    
    def transcribe_file(self, file_path: str, language: Optional[str] = None,
                       initial_prompt: Optional[str] = None) -> List["TranscriptionSegment"]:
        """
        Transcribe an audio file.
        
        Args:
            file_path: Path to audio file (any format librosa can decode)
            language: Language code (None for auto-detection)
            initial_prompt: Initial prompt for better transcription
            
        Returns:
            List of TranscriptionSegment objects
        
        Raises:
            Exception: Re-raises any load/transcription failure after logging it.
        """
        try:
            # librosa resamples to 16 kHz on load, so no further prep is needed.
            import librosa
            audio_data, sample_rate = librosa.load(file_path, sr=16000)
            
            return self.transcribe_audio(audio_data, sample_rate, language, initial_prompt)
            
        except Exception as e:
            logger.error(f"File transcription failed: {e}")
            raise
    
    def transcribe_segments(self, audio_data: np.ndarray, sample_rate: int, 
                           speaker_segments: List[Tuple[float, float, str]], 
                           word_timestamps: bool = True) -> List["TranscriptionSegment"]:
        """
        Transcribe audio and attribute each segment to a diarized speaker.
        
        Args:
            audio_data: Audio data as numpy array (any dtype; coerced to float32)
            sample_rate: Sample rate of the audio (resampled to 16 kHz if needed)
            speaker_segments: List of (start_time, end_time, speaker_id) tuples
            word_timestamps: Whether to include word-level timestamps
            
        Returns:
            List of TranscriptionSegment objects with speaker information
        
        Raises:
            RuntimeError: If the model is not initialized.
            Exception: Re-raises any transcription failure after logging it.
        """
        if self.model is None:
            raise RuntimeError("Model not initialized")
        
        try:
            audio_data = self._prepare_audio(audio_data, sample_rate)
            
            # Transcribe the whole audio once, then attach speakers per segment.
            result = self.model.transcribe(
                audio_data,
                language=self.language,
                word_timestamps=word_timestamps,
                verbose=False
            )
            
            segments = []
            for seg in result["segments"]:
                speaker_id = self._match_speaker(seg["start"], seg["end"], speaker_segments)
                words = self._extract_words(seg, include=word_timestamps)
                segments.append(self._build_segment(seg, result, words, speaker_id))
            return segments
            
        except Exception as e:
            logger.error(f"Segment transcription failed: {e}")
            raise

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes."""
        return [
            "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn", "sr", "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km", "sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi", "lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl", "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su"
        ]
    
    def detect_language(self, audio_data: np.ndarray, sample_rate: int = 16000) -> Tuple[str, float]:
        """
        Detect the language of audio data.
        
        Runs a full transcription pass and reads the language Whisper reports.
        Best-effort: failures are logged and reported as ("unknown", 0.0)
        rather than raised, so callers can treat the result as advisory.
        
        Args:
            audio_data: Audio data as numpy array (any dtype; coerced to float32)
            sample_rate: Sample rate of the audio
            
        Returns:
            Tuple of (language_code, confidence). Confidence falls back to 0.0
            because openai-whisper's transcribe() result does not include a
            language probability.
        """
        try:
            audio_data = self._prepare_audio(audio_data, sample_rate)
            
            # language=None lets Whisper auto-detect from the first window.
            result = self.model.transcribe(audio_data, language=None, verbose=False)
            
            return result.get("language", "unknown"), result.get("language_probability", 0.0)
            
        except Exception as e:
            logger.error(f"Language detection failed: {e}")
            return "unknown", 0.0


def create_speech_recognizer(model_size: str = "small", device: str = "auto",
                           compute_type: str = "int8", language: Optional[str] = None) -> SpeechRecognizer:
    """
    Factory function to create a SpeechRecognizer instance.

    Thin convenience wrapper that forwards every argument unchanged to the
    SpeechRecognizer constructor.

    Args:
        model_size: Whisper model size (tiny, base, small, medium, large)
        device: Device to use (auto, cpu, cuda)
        compute_type: Computation precision (int8, float16, float32)
        language: Target language code (None for auto-detection)

    Returns:
        A freshly constructed SpeechRecognizer instance
    """
    return SpeechRecognizer(
        model_size=model_size,
        device=device,
        compute_type=compute_type,
        language=language,
    )