File size: 12,632 Bytes
3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e fdcc0cf 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e fdcc0cf 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 fdcc0cf 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e 3f792e8 5de798e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 |
"""
Advanced Speech Recognition Module for Multilingual Audio Intelligence System
This module implements state-of-the-art automatic speech recognition using openai-whisper
with integrated language identification capabilities. Designed for maximum performance
on CPU-constrained environments while maintaining SOTA accuracy.
Key Features:
- OpenAI Whisper with optimized backend for speed improvement
- Integrated Language Identification (no separate LID module needed)
- VAD-based batching for real-time performance on CPU
- Word-level timestamps for interactive UI synchronization
- Robust error handling and multilingual support
- CPU and GPU optimization paths
Model: openai/whisper-small (optimized for speed/accuracy balance)
Dependencies: openai-whisper, torch, numpy
"""
import os
import logging
import warnings
import numpy as np
import torch
from typing import List, Dict, Optional, Tuple, Union
import tempfile
from dataclasses import dataclass
import time
try:
import whisper
WHISPER_AVAILABLE = True
except ImportError:
WHISPER_AVAILABLE = False
logging.warning("openai-whisper not available. Install with: pip install openai-whisper")
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Suppress warnings for cleaner output
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
@dataclass
class TranscriptionSegment:
"""
Data class representing a transcribed speech segment with rich metadata.
"""
start: float
end: float
text: str
language: str
language_probability: float
no_speech_probability: float
words: Optional[List[Dict]] = None
speaker_id: Optional[str] = None
confidence: Optional[float] = None
word_timestamps: Optional[List[Dict]] = None
class SpeechRecognizer:
"""
Advanced Speech Recognition Engine using OpenAI Whisper.
This class provides high-performance speech recognition with integrated language
identification, optimized for both CPU and GPU environments.
"""
def __init__(self, model_size: str = "small", device: str = "auto",
compute_type: str = "int8", language: Optional[str] = None):
"""
Initialize the Speech Recognizer.
Args:
model_size: Whisper model size (tiny, base, small, medium, large)
device: Device to use (auto, cpu, cuda)
compute_type: Computation precision (int8, float16, float32)
language: Target language code (None for auto-detection)
"""
self.model_size = model_size
self.device = self._determine_device(device)
self.compute_type = compute_type
self.language = language
self.model = None
self._initialize_model()
def _determine_device(self, device: str) -> str:
"""Determine the best available device."""
if device == "auto":
if torch.cuda.is_available():
return "cuda"
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
return "mps"
else:
return "cpu"
return device
def _initialize_model(self):
"""Initialize the Whisper model."""
if not WHISPER_AVAILABLE:
raise ImportError("openai-whisper is required. Install with: pip install openai-whisper")
try:
logger.info(f"Loading {self.model_size} Whisper model...")
self.model = whisper.load_model(self.model_size, device=self.device)
logger.info(f"Speech recognition models loaded on {self.device}")
except Exception as e:
logger.error(f"Failed to load Whisper model: {e}")
raise
def transcribe_audio(self, audio_data: np.ndarray, sample_rate: int = 16000,
language: Optional[str] = None,
initial_prompt: Optional[str] = None) -> List[TranscriptionSegment]:
"""
Transcribe audio data with language identification.
Args:
audio_data: Audio data as numpy array
sample_rate: Sample rate of the audio
language: Language code (None for auto-detection)
initial_prompt: Initial prompt for better transcription
Returns:
List of TranscriptionSegment objects
"""
if self.model is None:
raise RuntimeError("Model not initialized")
try:
# Prepare audio for Whisper (expects 16kHz)
if sample_rate != 16000:
import librosa
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
# Transcribe with Whisper
result = self.model.transcribe(
audio_data,
language=language or self.language,
initial_prompt=initial_prompt,
word_timestamps=True,
verbose=False
)
# Convert to our format
segments = []
for segment in result["segments"]:
words = []
if "words" in segment:
for word in segment["words"]:
words.append({
"word": word["word"],
"start": word["start"],
"end": word["end"],
"probability": word.get("probability", 1.0)
})
segments.append(TranscriptionSegment(
start=segment["start"],
end=segment["end"],
text=segment["text"].strip(),
language=result.get("language", "unknown"),
language_probability=result.get("language_probability", 1.0),
no_speech_probability=segment.get("no_speech_prob", 0.0),
words=words,
speaker_id=None,
confidence=1.0 - segment.get("no_speech_prob", 0.0),
word_timestamps=words
))
return segments
except Exception as e:
logger.error(f"Transcription failed: {e}")
raise
def transcribe_file(self, file_path: str, language: Optional[str] = None,
initial_prompt: Optional[str] = None) -> List[TranscriptionSegment]:
"""
Transcribe an audio file.
Args:
file_path: Path to audio file
language: Language code (None for auto-detection)
initial_prompt: Initial prompt for better transcription
Returns:
List of TranscriptionSegment objects
"""
try:
# Load audio file
import librosa
audio_data, sample_rate = librosa.load(file_path, sr=16000)
return self.transcribe_audio(audio_data, sample_rate, language, initial_prompt)
except Exception as e:
logger.error(f"File transcription failed: {e}")
raise
def transcribe_segments(self, audio_data: np.ndarray, sample_rate: int,
speaker_segments: List[Tuple[float, float, str]],
word_timestamps: bool = True) -> List[TranscriptionSegment]:
"""
Transcribe audio segments with speaker information.
Args:
audio_data: Audio data as numpy array
sample_rate: Sample rate of the audio
speaker_segments: List of (start_time, end_time, speaker_id) tuples
word_timestamps: Whether to include word-level timestamps
Returns:
List of TranscriptionSegment objects with speaker information
"""
if self.model is None:
raise RuntimeError("Model not initialized")
try:
# Prepare audio for Whisper (expects 16kHz)
if sample_rate != 16000:
import librosa
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
# Transcribe the entire audio first
result = self.model.transcribe(
audio_data,
language=self.language,
word_timestamps=word_timestamps,
verbose=False
)
# Convert to our format and add speaker information
segments = []
for segment in result["segments"]:
# Find the speaker for this segment
speaker_id = "Unknown"
for start_time, end_time, spk_id in speaker_segments:
if (segment["start"] >= start_time and segment["end"] <= end_time):
speaker_id = spk_id
break
words = []
if word_timestamps and "words" in segment:
for word in segment["words"]:
words.append({
"word": word["word"],
"start": word["start"],
"end": word["end"],
"probability": word.get("probability", 1.0)
})
segments.append(TranscriptionSegment(
start=segment["start"],
end=segment["end"],
text=segment["text"].strip(),
language=result.get("language", "unknown"),
language_probability=result.get("language_probability", 1.0),
no_speech_probability=segment.get("no_speech_prob", 0.0),
words=words,
speaker_id=speaker_id, # Add speaker information
confidence=1.0 - segment.get("no_speech_prob", 0.0),
word_timestamps=words
))
return segments
except Exception as e:
logger.error(f"Segment transcription failed: {e}")
raise
def get_supported_languages(self) -> List[str]:
"""Get list of supported language codes."""
return [
"en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn", "sr", "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km", "sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi", "lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl", "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su"
]
def detect_language(self, audio_data: np.ndarray, sample_rate: int = 16000) -> Tuple[str, float]:
"""
Detect the language of audio data.
Args:
audio_data: Audio data as numpy array
sample_rate: Sample rate of the audio
Returns:
Tuple of (language_code, confidence)
"""
try:
# Prepare audio for Whisper
if sample_rate != 16000:
import librosa
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
# Detect language using Whisper
result = self.model.transcribe(audio_data, language=None, verbose=False)
return result.get("language", "unknown"), result.get("language_probability", 0.0)
except Exception as e:
logger.error(f"Language detection failed: {e}")
return "unknown", 0.0
def create_speech_recognizer(model_size: str = "small", device: str = "auto",
compute_type: str = "int8", language: Optional[str] = None) -> SpeechRecognizer:
"""
Factory function to create a SpeechRecognizer instance.
Args:
model_size: Whisper model size
device: Device to use
compute_type: Computation precision
language: Target language code
Returns:
SpeechRecognizer instance
"""
return SpeechRecognizer(model_size, device, compute_type, language) |