"""Gradio demo: realtime speech-to-text using NVIDIA Riva streaming ASR.

Microphone audio is streamed from the browser in small chunks, converted to
PCM16, and forwarded to the hosted Riva ASR service over gRPC; interim and
final transcripts are pushed back into a live textbox.
"""

import os
import io
import numpy as np
import gradio as gr
import riva.client
import riva.client as riva_client
from dotenv import load_dotenv

load_dotenv()

# -------------------------------
# Auth: NVIDIA Cloud Functions gRPC endpoint for the hosted Riva ASR service.
# -------------------------------
URI = "grpc.nvcf.nvidia.com:443"
RIVA_FUNCTION_ID = "b702f636-f60c-4a3d-a6f4-f3568c13bd7d"

# Fail fast with an actionable message instead of a bare KeyError at import.
_api_key = os.environ.get("NVIDIA_API")
if not _api_key:
    raise RuntimeError(
        "NVIDIA_API environment variable is not set; "
        "add it to your environment or .env file."
    )

auth = riva_client.Auth(
    uri=URI,
    use_ssl=True,
    metadata_args=[
        ["function-id", RIVA_FUNCTION_ID],
        ["authorization", f"Bearer {_api_key}"],
    ],
)

# Riva speech-recognition client bound to the authenticated channel.
asr = riva_client.ASRService(auth)


# -------------------------------
# Helper: convert a Gradio audio chunk to raw PCM16 bytes.
# -------------------------------
def float_to_pcm16(audio_np: np.ndarray) -> bytes:
    """Convert an audio chunk to little-endian PCM16 bytes.

    Gradio may deliver chunks either as float arrays in [-1, 1] or as
    already-quantized int16 samples; both are handled (the original float-only
    path silently clipped int16 input to +/-1, producing near-silence).
    """
    if audio_np.dtype == np.int16:
        return audio_np.tobytes()
    clipped = np.clip(audio_np.astype(np.float32), -1.0, 1.0)
    return (clipped * 32767.0).astype(np.int16).tobytes()


# ---------- Generator ----------
def riva_stream_generator(audio_chunks, sample_rate=16000):
    """Stream audio to Riva and yield transcript updates for Gradio.

    Uses the modern Riva API:
    ``ASRService.streaming_response_generator(audio_chunks, streaming_config)``.

    Parameters
    ----------
    audio_chunks:
        Either an iterable of numpy audio chunks, or a single Gradio streaming
        payload of the form ``(sample_rate, np.ndarray)`` as produced by
        ``gr.Audio(streaming=True)``.
    sample_rate:
        Sample rate in Hz; overridden by the rate embedded in a Gradio payload.

    Yields
    ------
    str
        The top-alternative transcript for each (interim or final) result.
    """
    # NOTE(review): gr.Audio(streaming=True) sends one (rate, samples) tuple
    # per stream event; normalize that into the iterable-of-chunks shape the
    # rest of this function expects. Plain iterables still pass through.
    if isinstance(audio_chunks, tuple) and len(audio_chunks) == 2:
        sample_rate, chunk = audio_chunks
        audio_chunks = [chunk]

    recognition_config = riva_client.RecognitionConfig(
        language_code="en-US",
        sample_rate_hertz=sample_rate,
        max_alternatives=1,
        enable_automatic_punctuation=True,
        verbatim_transcripts=False,
    )
    streaming_config = riva_client.StreamingRecognitionConfig(
        config=recognition_config,
        interim_results=True,
    )

    def chunk_iterator():
        # Stop cleanly if the stream signals end-of-input with None.
        for chunk in audio_chunks:
            if chunk is None:
                break
            yield float_to_pcm16(chunk)

    responses = asr.streaming_response_generator(chunk_iterator(), streaming_config)

    # Each response may carry several results; surface the best hypothesis.
    for resp in responses:
        for result in resp.results:
            if result.alternatives:
                yield result.alternatives[0].transcript


# -------------------------------
# Gradio UI
# -------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ NVIDIA Riva Realtime ASR — True Streaming Demo")

    # Streams mic audio directly to the backend in small chunks.
    mic = gr.Audio(sources=["microphone"], streaming=True)
    transcript = gr.Textbox(label="Live Transcript", interactive=False, lines=6)

    # Wire the streaming callback: each mic chunk drives a transcript update.
    mic.stream(riva_stream_generator, inputs=mic, outputs=transcript)

if __name__ == "__main__":
    demo.launch()