""" Transcriber - Speech-to-text using Whisper Large v3 Handles audio transcription with automatic language detection """ import logging import numpy as np from typing import Tuple, Dict, Any from src.models.model_manager import ModelManager logger = logging.getLogger(__name__) class Transcriber: """Handles audio transcription and language detection""" def __init__(self): self.model_manager = ModelManager() def transcribe(self, audio_path: str, language: str = None) -> Dict[str, Any]: """ Transcribe audio file using Whisper Args: audio_path: Path to audio file language: Optional ISO-639-1 language code (e.g., 'en', 'hi') Returns: Dict with 'text', 'language', 'segments', and metadata """ logger.info(f"Transcribing audio: {audio_path}") whisper_model = self.model_manager.get_whisper_model(model_size="large") # Load audio with Whisper's built-in loading logger.info("Loading audio file...") options = { "language": language, "temperature": 0.0, # Deterministic output "best_of": 1, "beam_size": 5, "patience": 1.0, "length_penalty": 1.0, "repetition_penalty": 1.0, "compression_ratio_threshold": 2.4, "no_captions_threshold": 0.9, } result = whisper_model.transcribe( audio_path, **options, fp16=True if self.model_manager.get_device() == "cuda" else False ) logger.info(f"Transcription complete. Language: {result['language']}") return { "text": result["text"], "language": result["language"], "segments": result["segments"], "language_code": self._map_language_code(result["language"]) } def _map_language_code(self, whisper_lang: str) -> str: """ Map Whisper language names to ISO 639-3 codes for NLLB """ mapping = { "english": "eng_Latn", "hindi": "hin_Deva", "bengali": "ben_Beng", "tamil": "tam_Taml", "telugu": "tel_Telu", "marathi": "mar_Deva", "gujarati": "guj_Gujr", "kannada": "kan_Knda", "malayalam": "mal_Mlym", "punjabi": "pan_Guru", "urdu": "urd_Arab", "odia": "ory_Orya", "assamese": "asm_Beng", "nepali": "npi_Deva", "sinhalese": "sin_Sinh", "arabic": "arb_Arab", "french": "fra_Latn", "spanish": "spa_Latn", "german": "deu_Latn", "portuguese": "por_Latn", "russian": "rus_Cyrl", "chinese": "zho_Hans", "japanese": "jpn_Jpan", "korean": "kor_Hang", } return mapping.get(whisper_lang.lower(), "eng_Latn") def detect_language(self, audio_path: str) -> Tuple[str, str, float]: """ Detect language from audio file Returns: Tuple of (language_name, language_code, confidence) """ logger.info(f"Detecting language from: {audio_path}") whisper_model = self.model_manager.get_whisper_model(model_size="large") # Use Whisper's language detection result = whisper_model.detect_language(audio_path) # Extract language and confidence detected_lang = result language_code = self._map_language_code(detected_lang) logger.info(f"Detected language: {detected_lang} ({language_code})") return detected_lang, language_code, 0.95