"""
Transcriber - Speech-to-text using Whisper Large v3
Handles audio transcription with automatic language detection
"""
import logging
from typing import Any, Dict, Optional, Tuple

import numpy as np

from src.models.model_manager import ModelManager
logger = logging.getLogger(__name__)
class Transcriber:
    """Handles audio transcription and language detection using Whisper."""

    # Whisper language names -> ISO 639-3 + script codes used by NLLB.
    # Class-level so the dict is built once, not on every call.
    _LANG_TO_NLLB: Dict[str, str] = {
        "english": "eng_Latn",
        "hindi": "hin_Deva",
        "bengali": "ben_Beng",
        "tamil": "tam_Taml",
        "telugu": "tel_Telu",
        "marathi": "mar_Deva",
        "gujarati": "guj_Gujr",
        "kannada": "kan_Knda",
        "malayalam": "mal_Mlym",
        "punjabi": "pan_Guru",
        "urdu": "urd_Arab",
        "odia": "ory_Orya",
        "assamese": "asm_Beng",
        "nepali": "npi_Deva",
        "sinhalese": "sin_Sinh",
        "arabic": "arb_Arab",
        "french": "fra_Latn",
        "spanish": "spa_Latn",
        "german": "deu_Latn",
        "portuguese": "por_Latn",
        "russian": "rus_Cyrl",
        "chinese": "zho_Hans",
        "japanese": "jpn_Jpan",
        "korean": "kor_Hang",
    }

    def __init__(self):
        # ModelManager loads/caches the Whisper model and reports the device.
        self.model_manager = ModelManager()

    def transcribe(self, audio_path: str, language: Optional[str] = None) -> Dict[str, Any]:
        """
        Transcribe an audio file using Whisper.

        Args:
            audio_path: Path to the audio file.
            language: Optional ISO-639-1 language code (e.g., 'en', 'hi').
                When None, Whisper auto-detects the language.

        Returns:
            Dict with 'text', 'language', 'segments', and 'language_code'
            (NLLB-style code derived from the detected language).
        """
        logger.info("Transcribing audio: %s", audio_path)
        whisper_model = self.model_manager.get_whisper_model(model_size="large")

        options = {
            "language": language,  # None => auto-detect
            "temperature": 0.0,  # Deterministic output
            "best_of": 1,
            "beam_size": 5,
            "patience": 1.0,
            "length_penalty": 1.0,
            "compression_ratio_threshold": 2.4,
            # BUGFIX: openai-whisper's option is named "no_speech_threshold";
            # "no_captions_threshold" was its pre-release name and is rejected
            # (unknown kwargs forwarded to DecodingOptions raise TypeError).
            "no_speech_threshold": 0.9,
            # NOTE(review): the original also passed "repetition_penalty",
            # which openai-whisper does not accept (it is a faster-whisper
            # option) and would raise TypeError; removed. Re-add only if
            # ModelManager actually wraps faster-whisper — TODO confirm.
        }

        result = whisper_model.transcribe(
            audio_path,
            **options,
            # Half precision is only useful (and safe) on CUDA.
            fp16=self.model_manager.get_device() == "cuda",
        )

        logger.info("Transcription complete. Language: %s", result["language"])
        return {
            "text": result["text"],
            "language": result["language"],
            "segments": result["segments"],
            "language_code": self._map_language_code(result["language"]),
        }

    def _map_language_code(self, whisper_lang: str) -> str:
        """
        Map a Whisper language name to an NLLB ISO 639-3 code.

        Unknown languages fall back to English ("eng_Latn").
        """
        return self._LANG_TO_NLLB.get(whisper_lang.lower(), "eng_Latn")

    def detect_language(self, audio_path: str) -> Tuple[str, str, float]:
        """
        Detect the spoken language of an audio file.

        Returns:
            Tuple of (language_name, language_code, confidence).

        NOTE(review): openai-whisper's ``detect_language`` takes a mel
        spectrogram (not a file path) and returns token/probability data,
        so this assumes ModelManager returns a wrapper exposing a
        path-based ``detect_language`` that yields the language name
        directly — verify against ModelManager.
        """
        logger.info("Detecting language from: %s", audio_path)
        whisper_model = self.model_manager.get_whisper_model(model_size="large")

        detected_lang = whisper_model.detect_language(audio_path)
        language_code = self._map_language_code(detected_lang)

        logger.info("Detected language: %s (%s)", detected_lang, language_code)
        # Placeholder confidence: the wrapped API does not return one here.
        return detected_lang, language_code, 0.95