# voiceBot/app3.py — NVIDIA Riva realtime ASR streaming demo (Gradio UI).
# Author: Deepak Sahu
# History: "update voice to transcription" (commit f5d5c69)
import os
import io  # NOTE(review): imported but unused in this file — confirm before removing
import numpy as np
import gradio as gr
import riva.client
import riva.client as riva_client  # NOTE(review): duplicate of the line above under an alias; both spellings are used below
from dotenv import load_dotenv

# Load NVIDIA_API (and any other settings) from a local .env file into os.environ.
load_dotenv()
# -------------------------------
# Auth (your provided snippet)
# -------------------------------
# gRPC endpoint for NVIDIA Cloud Functions (NVCF) hosting the Riva ASR service.
uri = "grpc.nvcf.nvidia.com:443"
auth = riva_client.Auth(
    uri=uri,
    use_ssl=True,
    metadata_args=[
        # Identifies the deployed Riva ASR function on NVCF.
        ["function-id", "b702f636-f60c-4a3d-a6f4-f3568c13bd7d"],
        # Raises KeyError at import time if NVIDIA_API is missing from the environment/.env.
        ["authorization", f"Bearer {os.environ['NVIDIA_API']}"],
    ],
)
# Create Riva SpeechClient
asr = riva_client.ASRService(auth)
# -------------------------------
# Helper: convert Gradio audio chunk to PCM16
# -------------------------------
def float_to_pcm16(audio_np: np.ndarray) -> bytes:
    """Convert an audio buffer to raw little-endian PCM16 bytes.

    Float input is assumed to be normalized to [-1.0, 1.0] (out-of-range
    values are clipped) and is scaled to the int16 range. Int16 input —
    which is what Gradio's microphone component actually delivers — is
    passed through unchanged.

    Args:
        audio_np: 1-D (or any-shape) numpy array of audio samples.

    Returns:
        The samples as packed ``int16`` bytes.
    """
    # Bug fix: the original clipped int16 samples to [-1, 1] and rescaled,
    # which flattens real microphone audio to near-silence/garbage.
    if audio_np.dtype == np.int16:
        return audio_np.tobytes()
    audio_np = np.clip(audio_np, -1.0, 1.0)
    return (audio_np * 32767).astype(np.int16).tobytes()
# -------------------------------
# Streaming generator
# ---------- Generator ----------
def riva_stream_generator(audio_chunks, sample_rate=16000):
    """Stream audio to Riva ASR and yield interim/final transcripts.

    This uses the modern Riva API:
        streaming_response_generator(audio_chunks, streaming_config)

    Args:
        audio_chunks: Either an iterable of numpy audio arrays, or a single
            Gradio streaming chunk of the form ``(sample_rate, np.ndarray)``
            (what ``gr.Audio(streaming=True)`` actually delivers per event).
        sample_rate: Sample rate in Hz; overridden by the rate embedded in a
            Gradio ``(rate, data)`` tuple when one is received.

    Yields:
        str: The best-alternative transcript for each streaming result.
    """
    # Bug fix: Gradio's streaming Audio passes ONE (sample_rate, ndarray)
    # tuple per event, not an iterable of bare arrays. Normalize it here so
    # the rest of the code sees a plain iterable of arrays and the true rate.
    if isinstance(audio_chunks, tuple) and len(audio_chunks) == 2:
        sample_rate, data = audio_chunks
        audio_chunks = [data]

    offline_config = riva.client.RecognitionConfig(
        language_code="en-US",
        sample_rate_hertz=sample_rate,
        max_alternatives=1,
        enable_automatic_punctuation=True,
        verbatim_transcripts=False,
    )
    # Build RecognitionConfig and StreamingRecognitionConfig
    streaming_config = riva.client.StreamingRecognitionConfig(
        config=offline_config, interim_results=True
    )

    # Gradio will yield numpy chunks via audio_chunks; a None chunk marks
    # end-of-stream, so stop there.
    def chunk_iterator():
        for chunk in audio_chunks:
            if chunk is None:
                break
            yield float_to_pcm16(chunk)

    # Now call Riva streaming_response_generator
    responses = asr.streaming_response_generator(chunk_iterator(), streaming_config)

    # Each response may carry several results; yield the top alternative so
    # the Gradio textbox updates live with interim and final hypotheses.
    for resp in responses:
        for result in resp.results:
            if result.alternatives:
                transcript = result.alternatives[0].transcript
                yield transcript
# -------------------------------
# Gradio UI
# -------------------------------
with gr.Blocks() as demo:
    # Bug fix: the title string was mojibake (UTF-8 emoji/em-dash decoded
    # through a legacy codepage); restored to the intended characters.
    gr.Markdown("# 🎙️ NVIDIA Riva Realtime ASR — True Streaming Demo")
    # This streams mic audio directly to backend in small chunks
    mic = gr.Audio(sources=["microphone"], streaming=True)
    transcript = gr.Textbox(label="Live Transcript", interactive=False, lines=6)
    # Wire streaming callback: each incoming chunk is forwarded to the Riva
    # generator, and every yielded partial transcript updates the textbox.
    mic.stream(riva_stream_generator, inputs=mic, outputs=transcript)

demo.launch()