# AudioDubbAi/src/core/transcriber.py
"""
Transcriber - Speech-to-text using Whisper Large v3
Handles audio transcription with automatic language detection
"""
import logging
import numpy as np
from typing import Tuple, Dict, Any
from src.models.model_manager import ModelManager
logger = logging.getLogger(__name__)
class Transcriber:
    """Handles audio transcription and language detection using Whisper."""

    def __init__(self):
        # ModelManager provides the (cached) Whisper model and device info.
        self.model_manager = ModelManager()

    def transcribe(self, audio_path: str, language: str = None) -> Dict[str, Any]:
        """
        Transcribe an audio file using Whisper.

        Args:
            audio_path: Path to the audio file.
            language: Optional ISO-639-1 language code (e.g., 'en', 'hi').
                When None, Whisper auto-detects the language.

        Returns:
            Dict with 'text', 'language' (Whisper's ISO-639-1 code),
            'segments', and 'language_code' (NLLB FLORES-200 code).
        """
        logger.info("Transcribing audio: %s", audio_path)

        whisper_model = self.model_manager.get_whisper_model(model_size="large")

        # Only options accepted by whisper.transcribe()/DecodingOptions:
        # - 'best_of' must NOT be passed alongside 'beam_size' (and is invalid
        #   with temperature=0.0) — whisper raises ValueError otherwise.
        # - 'no_speech_threshold' is the correct name; 'no_captions_threshold'
        #   and 'repetition_penalty' are not whisper options and raise TypeError.
        options = {
            "language": language,
            "temperature": 0.0,  # deterministic (greedy/beam) decoding
            "beam_size": 5,
            "patience": 1.0,
            "length_penalty": 1.0,
            "compression_ratio_threshold": 2.4,
            "no_speech_threshold": 0.9,
        }

        result = whisper_model.transcribe(
            audio_path,
            **options,
            # Half precision is only meaningful on GPU.
            fp16=self.model_manager.get_device() == "cuda",
        )

        logger.info("Transcription complete. Language: %s", result["language"])

        return {
            "text": result["text"],
            "language": result["language"],
            "segments": result["segments"],
            "language_code": self._map_language_code(result["language"]),
        }

    def _map_language_code(self, whisper_lang: str) -> str:
        """
        Map a Whisper language to an NLLB FLORES-200 code.

        Accepts either the full language name ('hindi') or the ISO-639-1
        code ('hi') — whisper.transcribe() reports result['language'] as a
        code, so keying on names alone would never match.

        Falls back to 'eng_Latn' for unknown languages.
        """
        mapping = {
            "english": "eng_Latn", "en": "eng_Latn",
            "hindi": "hin_Deva", "hi": "hin_Deva",
            "bengali": "ben_Beng", "bn": "ben_Beng",
            "tamil": "tam_Taml", "ta": "tam_Taml",
            "telugu": "tel_Telu", "te": "tel_Telu",
            "marathi": "mar_Deva", "mr": "mar_Deva",
            "gujarati": "guj_Gujr", "gu": "guj_Gujr",
            "kannada": "kan_Knda", "kn": "kan_Knda",
            "malayalam": "mal_Mlym", "ml": "mal_Mlym",
            "punjabi": "pan_Guru", "pa": "pan_Guru",
            "urdu": "urd_Arab", "ur": "urd_Arab",
            "odia": "ory_Orya", "or": "ory_Orya",
            "assamese": "asm_Beng", "as": "asm_Beng",
            "nepali": "npi_Deva", "ne": "npi_Deva",
            # Whisper names this language 'sinhala'; keep the original
            # 'sinhalese' key for backward compatibility.
            "sinhalese": "sin_Sinh", "sinhala": "sin_Sinh", "si": "sin_Sinh",
            "arabic": "arb_Arab", "ar": "arb_Arab",
            "french": "fra_Latn", "fr": "fra_Latn",
            "spanish": "spa_Latn", "es": "spa_Latn",
            "german": "deu_Latn", "de": "deu_Latn",
            "portuguese": "por_Latn", "pt": "por_Latn",
            "russian": "rus_Cyrl", "ru": "rus_Cyrl",
            "chinese": "zho_Hans", "zh": "zho_Hans",
            "japanese": "jpn_Jpan", "ja": "jpn_Jpan",
            "korean": "kor_Hang", "ko": "kor_Hang",
        }
        return mapping.get(whisper_lang.lower(), "eng_Latn")

    def detect_language(self, audio_path: str) -> Tuple[str, str, float]:
        """
        Detect the spoken language of an audio file.

        Uses the first 30 seconds (Whisper's context window).

        Returns:
            Tuple of (iso_639_1_code, nllb_language_code, confidence),
            e.g. ('hi', 'hin_Deva', 0.97).
        """
        # openai-whisper — same package that backs get_whisper_model().
        import whisper

        logger.info("Detecting language from: %s", audio_path)
        whisper_model = self.model_manager.get_whisper_model(model_size="large")

        # model.detect_language() expects a log-Mel spectrogram tensor, not a
        # file path: load the audio, pad/trim it to 30 s, then compute the
        # spectrogram with the mel count the loaded model was trained with
        # (large-v3 uses 128 mels, earlier models 80).
        audio = whisper.load_audio(audio_path)
        audio = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(
            audio, n_mels=whisper_model.dims.n_mels
        ).to(whisper_model.device)

        # detect_language returns (language_tokens, language_probs); take the
        # most probable language and report its real probability instead of a
        # hard-coded confidence.
        _, probs = whisper_model.detect_language(mel)
        detected_lang = max(probs, key=probs.get)  # ISO-639-1 code, e.g. 'en'
        confidence = float(probs[detected_lang])
        language_code = self._map_language_code(detected_lang)

        logger.info(
            "Detected language: %s (%s, confidence %.2f)",
            detected_lang, language_code, confidence,
        )
        return detected_lang, language_code, confidence