File size: 2,848 Bytes
f5d5c69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import io
import numpy as np
import gradio as gr
import riva.client
import riva.client as riva_client
from dotenv import load_dotenv
load_dotenv()

# -------------------------------
# Auth (your provided snippet)
# -------------------------------
# NVIDIA Cloud Functions (NVCF) gRPC endpoint; Riva ASR is reached through
# it rather than a self-hosted Riva server.
uri = "grpc.nvcf.nvidia.com:443"

auth = riva_client.Auth(
    uri=uri,
    use_ssl=True,
    metadata_args=[
        # Routes the gRPC call to the specific hosted Riva ASR function.
        ["function-id", "b702f636-f60c-4a3d-a6f4-f3568c13bd7d"],
        # Requires the NVIDIA_API env var (loaded via dotenv above);
        # raises KeyError at import time if it is missing.
        ["authorization", f"Bearer {os.environ['NVIDIA_API']}"],
    ],
)

# Create Riva SpeechClient
asr = riva_client.ASRService(auth)

# -------------------------------
# Helper: convert Gradio audio chunk to PCM16
# -------------------------------
def float_to_pcm16(audio_np: np.ndarray) -> bytes:
    """Serialize an audio chunk as raw little-endian PCM16 bytes.

    Float input is assumed to lie in [-1.0, 1.0]; it is clipped and scaled
    to the int16 range. int16 input is passed through unchanged — Gradio's
    streaming microphone commonly delivers int16 samples already, and
    scaling those by 32767 would corrupt the audio.

    Args:
        audio_np: numpy array of audio samples (float or int16).

    Returns:
        The samples encoded as int16 bytes, ready for Riva streaming.
    """
    if audio_np.dtype == np.int16:
        # Already PCM16 — no conversion needed.
        return audio_np.tobytes()
    clipped = np.clip(audio_np, -1.0, 1.0)
    return (clipped * 32767).astype(np.int16).tobytes()

# -------------------------------
# Streaming generator
# ---------- Generator ----------
def riva_stream_generator(audio_chunks, sample_rate=16000):
    """Stream audio to Riva ASR and yield live transcript updates.

    Uses the modern Riva API:
        streaming_response_generator(audio_chunks, streaming_config)

    Args:
        audio_chunks: iterable of audio chunks. Each chunk may be either a
            bare numpy array or a Gradio-style ``(sample_rate, samples)``
            tuple (Gradio's streaming Audio component emits the latter);
            ``None`` terminates the stream.
        sample_rate: sample rate in Hz reported to Riva. Defaults to 16000.
            NOTE(review): the rate embedded in tuple chunks is ignored
            because the config is built before the first chunk arrives —
            confirm the mic is actually capturing at this rate.

    Yields:
        str: the latest transcript hypothesis (interim results enabled, so
        partial hypotheses are yielded as they are refined).
    """
    offline_config = riva.client.RecognitionConfig(
        language_code="en-US",
        sample_rate_hertz=sample_rate,
        max_alternatives=1,
        enable_automatic_punctuation=True,
        verbatim_transcripts=False,
    )
    streaming_config = riva.client.StreamingRecognitionConfig(
        config=offline_config, interim_results=True
    )

    def chunk_iterator():
        # Normalize each incoming chunk to raw PCM16 bytes for Riva.
        for chunk in audio_chunks:
            if chunk is None:
                break
            # Gradio streaming delivers (rate, samples) tuples — unwrap.
            if isinstance(chunk, tuple):
                chunk = chunk[1]
            if chunk.dtype == np.int16:
                # Already PCM16; scaling again would corrupt the audio.
                yield chunk.tobytes()
            else:
                yield float_to_pcm16(chunk)

    responses = asr.streaming_response_generator(chunk_iterator(), streaming_config)

    # Parse responses and push text updates to Gradio as they arrive.
    for resp in responses:
        for result in resp.results:
            if result.alternatives:
                yield result.alternatives[0].transcript

# -------------------------------
# Gradio UI
# -------------------------------
# Build the Gradio UI: a streaming microphone input wired to a generator
# callback, so transcripts update live as audio is captured.
with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ NVIDIA Riva Realtime ASR — True Streaming Demo")

    # This streams mic audio directly to backend in small chunks
    mic = gr.Audio(sources=["microphone"], streaming=True)
    # Read-only textbox that receives each yielded transcript string.
    transcript = gr.Textbox(label="Live Transcript", interactive=False, lines=6)

    # Wire streaming callback: each mic chunk invokes riva_stream_generator,
    # whose yielded strings replace the textbox contents.
    mic.stream(riva_stream_generator, inputs=mic, outputs=transcript)

# Blocks until the local web server is stopped.
demo.launch()