Spaces:
Running
Running
| """Voice note transcription for messaging platforms. | |
| Supports: | |
| - Local Whisper (cpu/cuda): Hugging Face transformers pipeline | |
| - NVIDIA NIM: NVIDIA NIM Whisper/Parakeet | |
| """ | |
| from pathlib import Path | |
| from typing import Any | |
| from loguru import logger | |
| from providers.nvidia_nim.voice import ( | |
| transcribe_audio_file as transcribe_nvidia_nim_audio, | |
| ) | |
| # Max file size in bytes (25 MB) | |
| MAX_AUDIO_SIZE_BYTES = 25 * 1024 * 1024 | |
| # Short model names -> full Hugging Face model IDs (for local Whisper) | |
| _MODEL_MAP: dict[str, str] = { | |
| "tiny": "openai/whisper-tiny", | |
| "base": "openai/whisper-base", | |
| "small": "openai/whisper-small", | |
| "medium": "openai/whisper-medium", | |
| "large-v2": "openai/whisper-large-v2", | |
| "large-v3": "openai/whisper-large-v3", | |
| "large-v3-turbo": "openai/whisper-large-v3-turbo", | |
| } | |
| # Lazy-loaded pipelines: (model_id, device, hf_token_fingerprint) -> pipeline | |
| _pipeline_cache: dict[tuple[str, str, str], Any] = {} | |
| def _resolve_model_id(whisper_model: str) -> str: | |
| """Resolve short name to full Hugging Face model ID.""" | |
| return _MODEL_MAP.get(whisper_model, whisper_model) | |
| def _get_pipeline(model_id: str, device: str, hf_token: str = "") -> Any: | |
| """Lazy-load transformers Whisper pipeline. Raises ImportError if not installed.""" | |
| global _pipeline_cache | |
| if device not in ("cpu", "cuda"): | |
| raise ValueError(f"whisper_device must be 'cpu' or 'cuda', got {device!r}") | |
| resolved_token = hf_token or "" | |
| cache_key = (model_id, device, resolved_token) | |
| if cache_key not in _pipeline_cache: | |
| try: | |
| import torch | |
| from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
| hf_auth_token = resolved_token or None | |
| use_cuda = device == "cuda" and torch.cuda.is_available() | |
| pipe_device = "cuda:0" if use_cuda else "cpu" | |
| model_dtype = torch.float16 if use_cuda else torch.float32 | |
| model = AutoModelForSpeechSeq2Seq.from_pretrained( | |
| model_id, | |
| dtype=model_dtype, | |
| low_cpu_mem_usage=True, | |
| attn_implementation="sdpa", | |
| token=hf_auth_token, | |
| ) | |
| model = model.to(pipe_device) | |
| processor = AutoProcessor.from_pretrained(model_id, token=hf_auth_token) | |
| pipe = pipeline( | |
| "automatic-speech-recognition", | |
| model=model, | |
| tokenizer=processor.tokenizer, | |
| feature_extractor=processor.feature_extractor, | |
| device=pipe_device, | |
| ) | |
| _pipeline_cache[cache_key] = pipe | |
| logger.debug( | |
| f"Loaded Whisper pipeline: model={model_id} device={pipe_device}" | |
| ) | |
| except ImportError as e: | |
| raise ImportError( | |
| "Local Whisper requires the voice_local extra. Install with: uv sync --extra voice_local" | |
| ) from e | |
| return _pipeline_cache[cache_key] | |
| def transcribe_audio( | |
| file_path: Path, | |
| mime_type: str, | |
| *, | |
| whisper_model: str = "base", | |
| whisper_device: str = "cpu", | |
| hf_token: str = "", | |
| nvidia_nim_api_key: str = "", | |
| ) -> str: | |
| """ | |
| Transcribe audio file to text. | |
| Supports: | |
| - whisper_device="cpu"/"cuda": local Whisper (requires voice_local extra) | |
| - whisper_device="nvidia_nim": NVIDIA NIM Whisper API (requires voice extra) | |
| Args: | |
| file_path: Path to audio file (OGG, MP3, MP4, WAV, M4A supported) | |
| mime_type: MIME type of the audio (e.g. "audio/ogg") | |
| whisper_model: Model ID or short name (local) or NVIDIA NIM model | |
| whisper_device: "cpu" | "cuda" | "nvidia_nim" | |
| Returns: | |
| Transcribed text | |
| Raises: | |
| FileNotFoundError: If file does not exist | |
| ValueError: If file too large | |
| ImportError: If voice_local extra not installed (for local Whisper) | |
| """ | |
| if not file_path.exists(): | |
| raise FileNotFoundError(f"Audio file not found: {file_path}") | |
| size = file_path.stat().st_size | |
| if size > MAX_AUDIO_SIZE_BYTES: | |
| raise ValueError( | |
| f"Audio file too large ({size} bytes). Max {MAX_AUDIO_SIZE_BYTES} bytes." | |
| ) | |
| if whisper_device == "nvidia_nim": | |
| return transcribe_nvidia_nim_audio( | |
| file_path, whisper_model, api_key=nvidia_nim_api_key | |
| ) | |
| return _transcribe_local( | |
| file_path, whisper_model, whisper_device, hf_token=hf_token | |
| ) | |
| # Whisper expects 16 kHz sample rate | |
| _WHISPER_SAMPLE_RATE = 16000 | |
| def _load_audio(file_path: Path) -> dict[str, Any]: | |
| """Load audio file to waveform dict. No ffmpeg required.""" | |
| import librosa | |
| waveform, sr = librosa.load(str(file_path), sr=_WHISPER_SAMPLE_RATE, mono=True) | |
| return {"array": waveform, "sampling_rate": sr} | |
| def _transcribe_local( | |
| file_path: Path, | |
| whisper_model: str, | |
| whisper_device: str, | |
| *, | |
| hf_token: str = "", | |
| ) -> str: | |
| """Transcribe using transformers Whisper pipeline.""" | |
| model_id = _resolve_model_id(whisper_model) | |
| pipe = _get_pipeline(model_id, whisper_device, hf_token=hf_token) | |
| audio = _load_audio(file_path) | |
| result = pipe(audio, generate_kwargs={"language": "en", "task": "transcribe"}) | |
| text = result.get("text", "") or "" | |
| if isinstance(text, list): | |
| text = " ".join(text) if text else "" | |
| result_text = text.strip() | |
| logger.debug(f"Local transcription: {len(result_text)} chars") | |
| return result_text or "(no speech detected)" | |