Spaces:

Yash030
/

claude-code-proxy

Running

App Files Files Community

claude-code-proxy / providers /nvidia_nim /voice.py

Yash030

Deploy claude-code-nvidia proxy to Hugging Face Spaces

0157ac7 28 days ago

raw

history blame contribute delete

3.18 kB

	"""NVIDIA NIM / Riva offline ASR for voice notes (provider-owned transport)."""

	from __future__ import annotations

	from pathlib import Path

	from loguru import logger

	# NVIDIA NIM ASR model mapping: model id -> (function_id, language_code).
	# ``transcribe_audio_file`` looks a model up here, sends the function id as
	# the "function-id" gRPC metadata entry, and passes the language code to
	# ``riva.client.RecognitionConfig``. Models missing from this table are
	# rejected with a ``ValueError`` listing the supported keys.
	_NIM_ASR_MODEL_MAP: dict[str, tuple[str, str]] = {
	"nvidia/parakeet-ctc-0.6b-zh-tw": ("8473f56d-51ef-473c-bb26-efd4f5def2bf", "zh-TW"),
	"nvidia/parakeet-ctc-0.6b-zh-cn": ("9add5ef7-322e-47e0-ad7a-5653fb8d259b", "zh-CN"),
	# function-id from NVIDIA NIM API docs (parakeet-ctc-0.6b-es).
	"nvidia/parakeet-ctc-0.6b-es": ("a9eeee8f-b509-4712-b19d-194361fa5f31", "es-US"),
	"nvidia/parakeet-ctc-0.6b-vi": ("f3dff2bb-99f9-403d-a5f1-f574a757deb0", "vi-VN"),
	"nvidia/parakeet-ctc-1.1b-asr": ("1598d209-5e27-4d3c-8079-4751568b1081", "en-US"),
	"nvidia/parakeet-ctc-0.6b-asr": ("d8dd4e9b-fbf5-4fb0-9dba-8cf436c8d965", "en-US"),
	"nvidia/parakeet-1.1b-rnnt-multilingual-asr": (
	"71203149-d3b7-4460-8231-1be2543a1fca",
	# NOTE(review): empty language code -- presumably leaves language
	# selection to the multilingual model; confirm against the NIM docs.
	"",
	),
	# NOTE(review): "multi" is presumably the multilingual marker expected by
	# the Whisper endpoint -- confirm against the NIM docs.
	"openai/whisper-large-v3": ("b702f636-f60c-4a3d-a6f4-f3568c13bd7d", "multi"),
	}

	# host:port given to ``riva.client.Auth`` as ``uri`` (used with use_ssl=True).
	_RIVA_SERVER = "grpc.nvcf.nvidia.com:443"


	def transcribe_audio_file(
	    file_path: Path,
	    model: str,
	    *,
	    api_key: str,
	) -> str:
	    """Transcribe audio using NVIDIA NIM / Riva gRPC (offline recognition).

	    The whole file is read into memory and sent in a single
	    ``offline_recognize`` call to ``_RIVA_SERVER``. The top alternative of
	    every result in the response is concatenated, so audio that the service
	    splits into several result segments is returned in full rather than
	    being cut off after the first segment.

	    Args:
	        file_path: Path to encoded audio bytes readable by Riva.
	        model: Hugging Face-style NIM model id (see ``_NIM_ASR_MODEL_MAP``).
	        api_key: NVIDIA API key (Bearer token); must be non-empty.

	    Returns:
	        Transcript text with surrounding whitespace stripped, or
	        ``(no speech detected)`` when the service returned no text.

	    Raises:
	        ValueError: If ``api_key`` is empty/blank, or ``model`` is not a key
	            of ``_NIM_ASR_MODEL_MAP``.
	        ImportError: If ``riva.client`` (the optional voice extra) is not
	            installed.
	    """
	    key = (api_key or "").strip()
	    if not key:
	        raise ValueError(
	            "NVIDIA NIM transcription requires a non-empty nvidia_nim_api_key "
	            "(configure NVIDIA_NIM_API_KEY or pass api_key explicitly)."
	        )

	    # Imported lazily so the module can be loaded without the optional
	    # voice dependencies installed.
	    try:
	        import riva.client
	    except ImportError as e:
	        raise ImportError(
	            "NVIDIA NIM transcription requires the voice extra. "
	            "Install with: uv sync --extra voice"
	        ) from e

	    model_config = _NIM_ASR_MODEL_MAP.get(model)
	    if not model_config:
	        raise ValueError(
	            f"No NVIDIA NIM config found for model: {model}. "
	            f"Supported models: {', '.join(_NIM_ASR_MODEL_MAP.keys())}"
	        )
	    function_id, language_code = model_config

	    # The function id selects the hosted model; the API key is sent as a
	    # Bearer token. Both travel as gRPC metadata on the TLS channel.
	    auth = riva.client.Auth(
	        use_ssl=True,
	        uri=_RIVA_SERVER,
	        metadata_args=[
	            ["function-id", function_id],
	            ["authorization", f"Bearer {key}"],
	        ],
	    )

	    asr_service = riva.client.ASRService(auth)

	    config = riva.client.RecognitionConfig(
	        language_code=language_code,
	        max_alternatives=1,
	        verbatim_transcripts=True,
	    )

	    data = Path(file_path).read_bytes()

	    response = asr_service.offline_recognize(data, config)

	    # A response may carry several results (one per recognized segment).
	    # Take the best alternative of each instead of only results[0], which
	    # silently dropped everything after the first segment.
	    segments = [
	        result.alternatives[0].transcript
	        for result in (getattr(response, "results", None) or ())
	        if result.alternatives
	    ]
	    # Strip so a whitespace-only transcript counts as "no speech".
	    transcript = "".join(segments).strip()

	    logger.debug(f"NIM transcription: {len(transcript)} chars")
	    return transcript or "(no speech detected)"