Yash030's picture
Deploy claude-code-nvidia proxy to Hugging Face Spaces
0157ac7
"""NVIDIA NIM / Riva offline ASR for voice notes (provider-owned transport)."""
from __future__ import annotations
from pathlib import Path
from loguru import logger
# Supported NVIDIA NIM ASR models (Parakeet variants and Whisper).
# Each model id maps to ``(function_id, language_code)``: the function id is
# sent as the ``function-id`` gRPC metadata entry and the language code is
# passed to the Riva recognition config.
_NIM_ASR_MODEL_MAP: dict[str, tuple[str, str]] = {
    "nvidia/parakeet-ctc-0.6b-zh-tw": (
        "8473f56d-51ef-473c-bb26-efd4f5def2bf",
        "zh-TW",
    ),
    "nvidia/parakeet-ctc-0.6b-zh-cn": (
        "9add5ef7-322e-47e0-ad7a-5653fb8d259b",
        "zh-CN",
    ),
    # function-id from NVIDIA NIM API docs (parakeet-ctc-0.6b-es).
    "nvidia/parakeet-ctc-0.6b-es": (
        "a9eeee8f-b509-4712-b19d-194361fa5f31",
        "es-US",
    ),
    "nvidia/parakeet-ctc-0.6b-vi": (
        "f3dff2bb-99f9-403d-a5f1-f574a757deb0",
        "vi-VN",
    ),
    "nvidia/parakeet-ctc-1.1b-asr": (
        "1598d209-5e27-4d3c-8079-4751568b1081",
        "en-US",
    ),
    "nvidia/parakeet-ctc-0.6b-asr": (
        "d8dd4e9b-fbf5-4fb0-9dba-8cf436c8d965",
        "en-US",
    ),
    # The multilingual model carries an empty language code -- presumably so
    # the service picks the language itself; TODO confirm against NIM docs.
    "nvidia/parakeet-1.1b-rnnt-multilingual-asr": (
        "71203149-d3b7-4460-8231-1be2543a1fca",
        "",
    ),
    "openai/whisper-large-v3": (
        "b702f636-f60c-4a3d-a6f4-f3568c13bd7d",
        "multi",
    ),
}

# gRPC endpoint (host:port) handed to the Riva client as its server URI.
_RIVA_SERVER = "grpc.nvcf.nvidia.com:443"
def transcribe_audio_file(
    file_path: Path,
    model: str,
    *,
    api_key: str,
) -> str:
    """Transcribe audio using NVIDIA NIM / Riva gRPC (offline recognition).

    Args:
        file_path: Path to encoded audio bytes readable by Riva.
        model: Hugging Face-style NIM model id (see ``_NIM_ASR_MODEL_MAP``).
        api_key: NVIDIA API key (Bearer token); must be non-empty.

    Returns:
        Transcript text covering every recognized segment of the file, or
        ``(no speech detected)`` when nothing was recognized.

    Raises:
        ValueError: If ``api_key`` is empty/blank or ``model`` is not a key of
            ``_NIM_ASR_MODEL_MAP``.
        ImportError: If the optional ``riva.client`` package is not installed.
        OSError: If ``file_path`` cannot be read.
    """
    key = (api_key or "").strip()
    if not key:
        raise ValueError(
            "NVIDIA NIM transcription requires a non-empty nvidia_nim_api_key "
            "(configure NVIDIA_NIM_API_KEY or pass api_key explicitly)."
        )

    # Imported lazily so the module stays importable without the voice extra.
    try:
        import riva.client
    except ImportError as e:
        raise ImportError(
            "NVIDIA NIM transcription requires the voice extra. "
            "Install with: uv sync --extra voice"
        ) from e

    model_config = _NIM_ASR_MODEL_MAP.get(model)
    if not model_config:
        raise ValueError(
            f"No NVIDIA NIM config found for model: {model}. "
            f"Supported models: {', '.join(_NIM_ASR_MODEL_MAP)}"
        )
    function_id, language_code = model_config

    # Read the audio before building the gRPC client so an unreadable file
    # fails fast without creating a client first.
    audio_bytes = Path(file_path).read_bytes()

    auth = riva.client.Auth(
        use_ssl=True,
        uri=_RIVA_SERVER,
        metadata_args=[
            ["function-id", function_id],
            ["authorization", f"Bearer {key}"],
        ],
    )
    asr_service = riva.client.ASRService(auth)
    config = riva.client.RecognitionConfig(
        language_code=language_code,
        max_alternatives=1,
        verbatim_transcripts=True,
    )
    response = asr_service.offline_recognize(audio_bytes, config)

    # Offline recognition may return one result per audio segment; stitch the
    # top alternative of every segment together instead of keeping only the
    # first one, which would truncate longer recordings.
    results = getattr(response, "results", None) or ()
    transcript = "".join(
        result.alternatives[0].transcript
        for result in results
        if result.alternatives
    )
    logger.debug(f"NIM transcription: {len(transcript)} chars")

    # Whitespace-only output carries no speech either.
    if not transcript.strip():
        return "(no speech detected)"
    return transcript