Spaces:
Running
Running
| """ | |
| Transcriber - Speech-to-text using Whisper Large v3 | |
| Handles audio transcription with automatic language detection | |
| """ | |
import logging
from typing import Any, Dict, Optional, Tuple

import numpy as np

from src.models.model_manager import ModelManager
| logger = logging.getLogger(__name__) | |
class Transcriber:
    """Handles audio transcription and language detection.

    The Whisper model and the compute device are obtained from
    ``ModelManager``; this class configures decoding, forwards the audio
    path to the model, and maps the language Whisper reports to the code
    expected by NLLB.
    """

    # NLLB code returned when a language cannot be mapped.
    _DEFAULT_NLLB_CODE = "eng_Latn"

    # Confidence reported when the model returns only a language string and
    # no probabilities (the value the original implementation always used).
    _FALLBACK_CONFIDENCE = 0.95

    # Whisper reports detected languages as ISO-639-1 codes ("hi"), so codes
    # are first normalised to the English names used as keys below.
    _NAME_BY_ISO = {
        "en": "english",
        "hi": "hindi",
        "bn": "bengali",
        "ta": "tamil",
        "te": "telugu",
        "mr": "marathi",
        "gu": "gujarati",
        "kn": "kannada",
        "ml": "malayalam",
        "pa": "punjabi",
        "ur": "urdu",
        "or": "odia",
        "as": "assamese",
        "ne": "nepali",
        "si": "sinhalese",
        "ar": "arabic",
        "fr": "french",
        "es": "spanish",
        "de": "german",
        "pt": "portuguese",
        "ru": "russian",
        "zh": "chinese",
        "ja": "japanese",
        "ko": "korean",
    }

    _NLLB_BY_NAME = {
        "english": "eng_Latn",
        "hindi": "hin_Deva",
        "bengali": "ben_Beng",
        "tamil": "tam_Taml",
        "telugu": "tel_Telu",
        "marathi": "mar_Deva",
        "gujarati": "guj_Gujr",
        "kannada": "kan_Knda",
        "malayalam": "mal_Mlym",
        "punjabi": "pan_Guru",
        "urdu": "urd_Arab",
        "odia": "ory_Orya",
        "assamese": "asm_Beng",
        "nepali": "npi_Deva",
        "sinhalese": "sin_Sinh",
        "sinhala": "sin_Sinh",  # Whisper's own spelling of the name
        "arabic": "arb_Arab",
        "french": "fra_Latn",
        "spanish": "spa_Latn",
        "german": "deu_Latn",
        "portuguese": "por_Latn",
        "russian": "rus_Cyrl",
        "chinese": "zho_Hans",
        "japanese": "jpn_Jpan",
        "korean": "kor_Hang",
    }

    def __init__(self):
        """Create the ModelManager used to fetch the Whisper model and device."""
        self.model_manager = ModelManager()

    def transcribe(self, audio_path: str, language: Optional[str] = None) -> Dict[str, Any]:
        """
        Transcribe audio file using Whisper

        Args:
            audio_path: Path to audio file
            language: Optional ISO-639-1 language code (e.g., 'en', 'hi').
                ``None`` is forwarded unchanged, leaving the choice of
                language to the model.

        Returns:
            Dict with 'text', 'language' and 'segments' taken from the model
            result, plus 'language_code' (the NLLB code for 'language').
        """
        logger.info("Transcribing audio: %s", audio_path)
        whisper_model = self.model_manager.get_whisper_model(model_size="large")

        # The audio file itself is loaded by the model's transcribe() call.
        logger.info("Loading audio file...")

        # NOTE(review): assumes ModelManager returns an openai-whisper model
        # (the fp16 keyword and the dict-style result match that API) - confirm.
        # Two keys from the previous option set were dropped/renamed because
        # that API rejects unknown keywords with a TypeError:
        #   * "no_captions_threshold" is not a Whisper option; the silence
        #     threshold is called "no_speech_threshold".
        #   * "repetition_penalty" is not a decoding option there, and 1.0 is
        #     the neutral "no penalty" value wherever it is supported.
        options = {
            "language": language,
            "temperature": 0.0,  # Deterministic output
            "best_of": 1,
            "beam_size": 5,
            "patience": 1.0,
            "length_penalty": 1.0,
            "compression_ratio_threshold": 2.4,
            "no_speech_threshold": 0.9,
        }

        result = whisper_model.transcribe(
            audio_path,
            **options,
            # Half precision is only requested when running on CUDA.
            fp16=self.model_manager.get_device() == "cuda",
        )

        detected = result["language"]
        logger.info("Transcription complete. Language: %s", detected)
        return {
            "text": result["text"],
            "language": detected,
            "segments": result["segments"],
            "language_code": self._map_language_code(detected),
        }

    def _map_language_code(self, whisper_lang: str) -> str:
        """
        Map a Whisper language identifier to the matching NLLB code.

        Accepts either an ISO-639-1 code ('hi') or an English language name
        ('hindi'); matching ignores case and surrounding whitespace.

        Returns:
            The NLLB code (e.g. 'hin_Deva'), or 'eng_Latn' when the input is
            not a string or is not in the table.
        """
        if not isinstance(whisper_lang, str):
            return self._DEFAULT_NLLB_CODE
        key = whisper_lang.strip().lower()
        name = self._NAME_BY_ISO.get(key, key)
        return self._NLLB_BY_NAME.get(name, self._DEFAULT_NLLB_CODE)

    @classmethod
    def _parse_detection(cls, result: Any) -> Tuple[str, float]:
        """
        Normalise a language-detection result to (language, confidence).

        Handled shapes:
            * a plain string                  -> (string, fallback confidence)
            * a mapping of language -> prob   -> most probable entry
            * a pair (anything, mapping)      -> most probable entry
            * a pair (language string, prob)  -> returned as given

        Raises:
            TypeError: if the result has none of these shapes.
        """
        if isinstance(result, str):
            return result, cls._FALLBACK_CONFIDENCE

        probs = None
        if isinstance(result, dict):
            probs = result
        elif isinstance(result, (tuple, list)) and len(result) == 2:
            first, second = result
            if isinstance(second, dict):
                probs = second
            elif isinstance(first, str):
                return first, float(second)

        if probs:
            best = max(probs, key=probs.get)
            return best, float(probs[best])

        raise TypeError(
            f"Unsupported language detection result: {type(result).__name__}"
        )

    def detect_language(self, audio_path: str) -> Tuple[str, str, float]:
        """
        Detect language from audio file

        Returns:
            Tuple of (language, language_code, confidence) where 'language'
            is the identifier reported by the model, 'language_code' is its
            NLLB code, and 'confidence' is the model's probability when it
            provides one (0.95 otherwise).
        """
        logger.info("Detecting language from: %s", audio_path)
        whisper_model = self.model_manager.get_whisper_model(model_size="large")

        # NOTE(review): openai-whisper's detect_language() expects a log-Mel
        # spectrogram rather than a path; this call only works if the object
        # returned by ModelManager accepts paths - confirm against ModelManager.
        result = whisper_model.detect_language(audio_path)

        # The raw result may be a (tokens, probabilities) pair rather than a
        # string, so it is normalised before being mapped.
        detected_lang, confidence = self._parse_detection(result)
        language_code = self._map_language_code(detected_lang)
        logger.info("Detected language: %s (%s)", detected_lang, language_code)
        return detected_lang, language_code, confidence