Spaces:
Running
Running
| """NVIDIA NIM / Riva offline ASR for voice notes (provider-owned transport).""" | |
| from __future__ import annotations | |
| from pathlib import Path | |
| from loguru import logger | |
# Supported NVIDIA NIM ASR models. Each model id maps to a
# ``(function_id, language_code)`` pair; insertion order is preserved because
# the keys are listed verbatim in the "Supported models" error message.
_NIM_ASR_MODEL_MAP: dict[str, tuple[str, str]] = {
    "nvidia/parakeet-ctc-0.6b-zh-tw": (
        "8473f56d-51ef-473c-bb26-efd4f5def2bf",
        "zh-TW",
    ),
    "nvidia/parakeet-ctc-0.6b-zh-cn": (
        "9add5ef7-322e-47e0-ad7a-5653fb8d259b",
        "zh-CN",
    ),
    # function-id from NVIDIA NIM API docs (parakeet-ctc-0.6b-es).
    "nvidia/parakeet-ctc-0.6b-es": (
        "a9eeee8f-b509-4712-b19d-194361fa5f31",
        "es-US",
    ),
    "nvidia/parakeet-ctc-0.6b-vi": (
        "f3dff2bb-99f9-403d-a5f1-f574a757deb0",
        "vi-VN",
    ),
    "nvidia/parakeet-ctc-1.1b-asr": (
        "1598d209-5e27-4d3c-8079-4751568b1081",
        "en-US",
    ),
    "nvidia/parakeet-ctc-0.6b-asr": (
        "d8dd4e9b-fbf5-4fb0-9dba-8cf436c8d965",
        "en-US",
    ),
    # The multilingual RNNT entry carries an empty language code.
    "nvidia/parakeet-1.1b-rnnt-multilingual-asr": (
        "71203149-d3b7-4460-8231-1be2543a1fca",
        "",
    ),
    "openai/whisper-large-v3": (
        "b702f636-f60c-4a3d-a6f4-f3568c13bd7d",
        "multi",
    ),
}

# host:port of the hosted Riva gRPC endpoint.
_RIVA_SERVER = "grpc.nvcf.nvidia.com:443"
def _collect_transcript(response: object) -> str:
    """Join the top alternative of every result in a recognize response.

    Each result's first alternative is stripped and the non-empty pieces are
    joined with single spaces, so multi-result responses are not truncated to
    their first segment.
    """
    segments = []
    for result in getattr(response, "results", None) or ():
        alternatives = getattr(result, "alternatives", None)
        if not alternatives:
            continue
        text = (alternatives[0].transcript or "").strip()
        if text:
            segments.append(text)
    return " ".join(segments)


def transcribe_audio_file(
    file_path: Path,
    model: str,
    *,
    api_key: str,
) -> str:
    """Transcribe audio using NVIDIA NIM / Riva gRPC (offline recognition).

    Args:
        file_path: Path to encoded audio bytes readable by Riva.
        model: Hugging Face-style NIM model id (see ``_NIM_ASR_MODEL_MAP``).
        api_key: NVIDIA API key (Bearer token); must be non-empty.

    Returns:
        Transcript text covering every result in the response, stripped of
        surrounding whitespace, or ``(no speech detected)`` when empty.

    Raises:
        ValueError: If ``api_key`` is blank or ``model`` is not a key of
            ``_NIM_ASR_MODEL_MAP``.
        ImportError: If ``riva.client`` cannot be imported.
        OSError: If ``file_path`` cannot be read.
    """
    key = (api_key or "").strip()
    if not key:
        raise ValueError(
            "NVIDIA NIM transcription requires a non-empty nvidia_nim_api_key "
            "(configure NVIDIA_NIM_API_KEY or pass api_key explicitly)."
        )

    # Imported lazily so the module stays importable without the voice extra.
    try:
        import riva.client
    except ImportError as e:
        raise ImportError(
            "NVIDIA NIM transcription requires the voice extra. "
            "Install with: uv sync --extra voice"
        ) from e

    model_config = _NIM_ASR_MODEL_MAP.get(model)
    if model_config is None:
        raise ValueError(
            f"No NVIDIA NIM config found for model: {model}. "
            f"Supported models: {', '.join(_NIM_ASR_MODEL_MAP)}"
        )
    function_id, language_code = model_config

    auth = riva.client.Auth(
        use_ssl=True,
        uri=_RIVA_SERVER,
        metadata_args=[
            ["function-id", function_id],
            ["authorization", f"Bearer {key}"],
        ],
    )
    try:
        asr_service = riva.client.ASRService(auth)
        config = riva.client.RecognitionConfig(
            language_code=language_code,
            max_alternatives=1,
            verbatim_transcripts=True,
        )
        data = Path(file_path).read_bytes()
        response = asr_service.offline_recognize(data, config)
    finally:
        # NOTE(review): assumes Auth exposes its gRPC channel as ``.channel``;
        # the lookup is defensive so a different client layout is a no-op.
        channel = getattr(auth, "channel", None)
        close = getattr(channel, "close", None)
        if callable(close):
            close()

    transcript = _collect_transcript(response)
    logger.debug("NIM transcription: {} chars", len(transcript))
    return transcript or "(no speech detected)"