# NVIDIA Riva realtime ASR — Hugging Face Space demo app
| import os | |
| import io | |
| import numpy as np | |
| import gradio as gr | |
| import riva.client | |
| import riva.client as riva_client | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
# -------------------------------
# Auth: connect to the NVCF-hosted Riva ASR endpoint
# -------------------------------
# gRPC endpoint for NVIDIA Cloud Functions (NVCF) hosting the Riva model.
uri = "grpc.nvcf.nvidia.com:443"

# Fail fast with an actionable message instead of an opaque KeyError at
# import time when the API key is missing from the environment / .env file.
_api_key = os.environ.get("NVIDIA_API")
if not _api_key:
    raise RuntimeError(
        "NVIDIA_API environment variable is not set; "
        "export your NVIDIA API key (or put it in a .env file)."
    )

# NVCF function id of the hosted ASR model; overridable via env so other
# deployments can reuse this app without editing the source.
_function_id = os.environ.get(
    "NVIDIA_FUNCTION_ID", "b702f636-f60c-4a3d-a6f4-f3568c13bd7d"
)

auth = riva_client.Auth(
    uri=uri,
    use_ssl=True,
    metadata_args=[
        ["function-id", _function_id],
        ["authorization", f"Bearer {_api_key}"],
    ],
)

# Speech-recognition service client bound to the authenticated channel.
asr = riva_client.ASRService(auth)
| # ------------------------------- | |
| # Helper: convert Gradio audio chunk to PCM16 | |
| # ------------------------------- | |
def float_to_pcm16(audio_np: np.ndarray) -> bytes:
    """Convert an audio chunk to raw little-endian PCM16 bytes.

    Float input is assumed to be normalized to ``[-1.0, 1.0]``; values
    outside that range are clipped before scaling to the int16 range.
    Int16 input (the dtype Gradio microphone chunks arrive in) is already
    PCM16 and is passed through unchanged — rescaling it by 32767 would
    corrupt the audio.

    Parameters
    ----------
    audio_np : np.ndarray
        1-D audio samples, float (normalized) or int16 (raw PCM).

    Returns
    -------
    bytes
        The samples as contiguous little-endian 16-bit PCM.
    """
    if audio_np.dtype == np.int16:
        # Already PCM16 — no clipping or scaling needed.
        return audio_np.tobytes()
    audio_np = np.clip(audio_np, -1.0, 1.0)
    return (audio_np * 32767).astype(np.int16).tobytes()
| # ------------------------------- | |
| # Streaming generator | |
| # ---------- Generator ---------- | |
def riva_stream_generator(audio_chunks, sample_rate=16000):
    """Stream audio chunks to Riva ASR and yield transcript updates.

    Uses the modern Riva API:
    ``ASRService.streaming_response_generator(audio_chunks, streaming_config)``.

    Parameters
    ----------
    audio_chunks : iterable
        Iterable of audio chunks. Each chunk may be a bare numpy array or a
        Gradio-style ``(sample_rate, data)`` tuple (gr.Audio in streaming
        mode emits the tuple form); a ``None`` chunk terminates the stream.
    sample_rate : int
        Sample rate (Hz) declared to the recognizer. NOTE(review): if a
        tuple chunk carries a different rate than this value, recognition
        quality will suffer — confirm the mic is configured for 16 kHz.

    Yields
    ------
    str
        The latest interim or final transcript text for display.
    """
    # Use the file's riva_client alias consistently (the original mixed
    # `riva.client.*` and `riva_client.*`, which refer to the same module).
    offline_config = riva_client.RecognitionConfig(
        language_code="en-US",
        sample_rate_hertz=sample_rate,
        max_alternatives=1,
        enable_automatic_punctuation=True,
        verbatim_transcripts=False,
    )
    streaming_config = riva_client.StreamingRecognitionConfig(
        config=offline_config,
        interim_results=True,
    )

    def chunk_iterator():
        # Normalize each incoming chunk to raw PCM16 bytes for the gRPC stream.
        for chunk in audio_chunks:
            if chunk is None:
                break
            if isinstance(chunk, tuple):
                # Gradio streaming audio arrives as (sample_rate, ndarray);
                # keep only the sample data.
                chunk = chunk[1]
            yield float_to_pcm16(chunk)

    responses = asr.streaming_response_generator(chunk_iterator(), streaming_config)

    # Relay the top hypothesis of every result so the UI updates live.
    for resp in responses:
        for result in resp.results:
            if result.alternatives:
                yield result.alternatives[0].transcript
| # ------------------------------- | |
| # Gradio UI | |
| # ------------------------------- | |
# -------------------------------
# Gradio UI
# -------------------------------
with gr.Blocks() as demo:
    # The original title string was mojibake ("๐๏ธ … โ …"); restored to
    # the intended microphone emoji and em dash.
    gr.Markdown("# 🎙️ NVIDIA Riva Realtime ASR — True Streaming Demo")

    # Microphone component that emits small audio chunks while recording.
    mic = gr.Audio(sources=["microphone"], streaming=True)
    transcript = gr.Textbox(label="Live Transcript", interactive=False, lines=6)

    # Each mic chunk is fed to the generator; every value it yields
    # replaces the textbox content.
    mic.stream(riva_stream_generator, inputs=mic, outputs=transcript)

demo.launch()