Deepak Sahu committed
Commit f5d5c69 · 1 Parent(s): 4666ab5

update voice to transcription

Files changed (9)
  1. .gitignore +1 -0
  2. .vscode/launch.json +16 -0
  3. README.md +4 -0
  4. app-1.py +8 -0
  5. app.py +56 -5
  6. app3.py +86 -0
  7. requirements.txt +2 -1
  8. test1.py +8 -0
  9. test2.py +33 -0
.gitignore CHANGED
@@ -1,2 +1,3 @@
  /sb-voiceBot
  .env
+ *.pyc
.vscode/launch.json ADDED
@@ -0,0 +1,16 @@
+ {
+     // Use IntelliSense to learn about possible attributes.
+     // Hover to view descriptions of existing attributes.
+     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+     "version": "0.2.0",
+     "configurations": [
+
+         {
+             "name": "Python Debugger: Current File",
+             "type": "debugpy",
+             "request": "launch",
+             "program": "${file}",
+             "console": "integratedTerminal"
+         }
+     ]
+ }
README.md CHANGED
@@ -11,3 +11,7 @@ short_description: NVIDIA RIVA based voiceBot
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ References used:
+ - https://www.gradio.app/guides/conversational-chatbot
+ - Riva datamodel reference: https://docs.nvidia.com/deeplearning/riva/user-guide/docs/reference/protos/protos.html#
app-1.py ADDED
@@ -0,0 +1,8 @@
+ import gradio as gr
+ from test1 import foo
+
+ def greet(name):
+     return "Hello " + name + "!!" + foo()
+
+ demo = gr.Interface(fn=greet, inputs="text", outputs="text")
+ demo.launch()
app.py CHANGED
@@ -1,8 +1,59 @@
  import gradio as gr
- from test1 import foo
-
- def greet(name):
-     return "Hello " + name + "!!" + foo()
-
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ import io
+ import numpy as np
+ import soundfile as sf
+ from test1 import asr_transcribe
+
+
+ def audio_to_bytes(audio_input) -> bytes:
+     """
+     Convert a Gradio audio input (numpy array or filepath) to WAV bytes.
+
+     Parameters:
+     audio_input: tuple | str
+         - If tuple: (sample_rate, numpy_array), as Gradio delivers microphone audio
+         - If str: path to an audio file
+
+     Returns:
+     bytes: The WAV file bytes.
+     """
+     if isinstance(audio_input, str):
+         # audio_input is a file path; soundfile returns (data, samplerate)
+         data, samplerate = sf.read(audio_input)
+     elif isinstance(audio_input, (tuple, list)) and len(audio_input) == 2:
+         # audio_input is (sample_rate, numpy array)
+         samplerate, data = audio_input
+     else:
+         raise ValueError("Invalid audio input. Expected (sample_rate, numpy_array) or file path string.")
+
+     # Ensure mono (channel count = 1)
+     if data.ndim > 1:
+         data = np.mean(data, axis=1)  # average channels to mono
+
+     # Write to an in-memory buffer
+     wav_buffer = io.BytesIO()
+     sf.write(wav_buffer, data, samplerate, format='WAV')
+     wav_bytes = wav_buffer.getvalue()
+     wav_buffer.close()
+
+     return wav_bytes
+
+
+ def transcribe(audio):
+     # Convert the audio to WAV bytes, then run Riva offline recognition
+     audio_bytes = audio_to_bytes(audio)
+     transcription = asr_transcribe(audio_bytes)
+     return transcription
+
+
+ demo = gr.Interface(
+     transcribe,
+     gr.Audio(sources=["microphone"]),
+     "text",
+ )
+
+ demo.launch()
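The transcribe path above hinges on the numpy-to-WAV-bytes round trip inside `audio_to_bytes`. Below is a standalone sanity check of that same conversion (the logic is repeated here rather than imported, since importing `app.py` would call `demo.launch()`; the 440 Hz tone is just illustrative):

```python
# Standalone sketch of the numpy -> WAV-bytes conversion used by audio_to_bytes().
import io

import numpy as np
import soundfile as sf

sample_rate = 16000
t = np.linspace(0, 1.0, sample_rate, endpoint=False)
tone = (0.5 * np.sin(2 * np.pi * 440.0 * t)).astype(np.float32)  # 1 s, 440 Hz, mono

# Write to an in-memory WAV buffer, as audio_to_bytes() does
buf = io.BytesIO()
sf.write(buf, tone, sample_rate, format="WAV")
wav_bytes = buf.getvalue()

# Read the bytes back to confirm they form a valid WAV payload
data, rate = sf.read(io.BytesIO(wav_bytes))
print(rate, data.shape)  # expect 16000 and (16000,)
```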
app3.py ADDED
@@ -0,0 +1,86 @@
+ import os
+ import numpy as np
+ import gradio as gr
+ import riva.client
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ # -------------------------------
+ # Auth
+ # -------------------------------
+ uri = "grpc.nvcf.nvidia.com:443"
+
+ auth = riva.client.Auth(
+     uri=uri,
+     use_ssl=True,
+     metadata_args=[
+         ["function-id", "b702f636-f60c-4a3d-a6f4-f3568c13bd7d"],
+         ["authorization", f"Bearer {os.environ['NVIDIA_API']}"],
+     ],
+ )
+
+ # Create the Riva ASR service client
+ asr = riva.client.ASRService(auth)
+
+ # -------------------------------
+ # Helper: convert a float audio chunk to PCM16 bytes
+ # -------------------------------
+ def float_to_pcm16(audio_np: np.ndarray) -> bytes:
+     audio_np = np.clip(audio_np, -1.0, 1.0)
+     return (audio_np * 32767).astype(np.int16).tobytes()
+
+ # -------------------------------
+ # Streaming generator
+ # -------------------------------
+ def riva_stream_generator(audio_chunks, sample_rate=16000):
+     """
+     Uses the Riva streaming API:
+     streaming_response_generator(audio_chunks, streaming_config)
+     """
+     recognition_config = riva.client.RecognitionConfig(
+         language_code="en-US",
+         sample_rate_hertz=sample_rate,
+         max_alternatives=1,
+         enable_automatic_punctuation=True,
+         verbatim_transcripts=False,
+     )
+     # Wrap the RecognitionConfig in a StreamingRecognitionConfig
+     streaming_config = riva.client.StreamingRecognitionConfig(config=recognition_config, interim_results=True)
+
+     # Gradio yields numpy chunks via audio_chunks; convert each to PCM16 bytes.
+     # (Assumes chunks arrive as float arrays in [-1, 1].)
+     def chunk_iterator():
+         for chunk in audio_chunks:
+             if chunk is None:
+                 break
+             yield float_to_pcm16(chunk)
+
+     # Call the Riva streaming generator with the audio iterator
+     responses = asr.streaming_response_generator(chunk_iterator(), streaming_config)
+
+     # Parse responses and yield text updates to Gradio
+     for resp in responses:
+         for result in resp.results:
+             if result.alternatives:
+                 transcript = result.alternatives[0].transcript
+                 yield transcript
+
+ # -------------------------------
+ # Gradio UI
+ # -------------------------------
+ with gr.Blocks() as demo:
+     gr.Markdown("# 🎙️ NVIDIA Riva Realtime ASR — True Streaming Demo")
+
+     # This streams mic audio to the backend in small chunks
+     mic = gr.Audio(sources=["microphone"], streaming=True)
+     transcript = gr.Textbox(label="Live Transcript", interactive=False, lines=6)
+
+     # Wire the streaming callback
+     mic.stream(riva_stream_generator, inputs=mic, outputs=transcript)
+
+ demo.launch()
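The streaming path depends on `float_to_pcm16()` producing well-formed little-endian int16 frames for Riva. A quick standalone check of that conversion, with the function copied verbatim and made-up sample values (including one out-of-range value to show the clipping):

```python
# Quick check of the float -> PCM16 conversion used by app3.py.
import numpy as np

def float_to_pcm16(audio_np: np.ndarray) -> bytes:
    audio_np = np.clip(audio_np, -1.0, 1.0)
    return (audio_np * 32767).astype(np.int16).tobytes()

chunk = np.array([0.0, 0.5, 1.0, -1.0, 2.0])  # 2.0 is out of range and gets clipped
pcm = float_to_pcm16(chunk)

print(len(pcm))                             # 10 bytes: 5 samples * 2 bytes each
print(np.frombuffer(pcm, dtype=np.int16))   # [     0  16383  32767 -32767  32767]
```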
requirements.txt CHANGED
@@ -1,3 +1,4 @@
  gradio
  nvidia-riva-client
- python-dotenv
+ python-dotenv
+ soundfile
test1.py CHANGED
@@ -1,4 +1,5 @@
  import riva.client
+ import riva.client.realtime
  from riva.client.argparse_utils import add_asr_config_argparse_parameters, add_connection_argparse_parameters
  import os
  from dotenv import load_dotenv
@@ -42,6 +43,13 @@ with open("./en-US_sample.wav", 'rb') as fh:
      data = fh.read()


+ def asr_transcribe(audio: bytes):
+     global offline_config, asr_service
+     response = asr_service.offline_recognize(audio, offline_config)
+     transcript = " ".join([result.alternatives[0].transcript for result in response.results])
+     # print("Final transcript:", transcript)
+     return transcript
+
  def foo():
      global data, offline_config, asr_service
      response = asr_service.offline_recognize(data, offline_config)
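For reference, the new `asr_transcribe()` can be driven the same way the existing `foo()` helper is, by feeding it raw WAV bytes. A minimal sketch, assuming the `NVIDIA_API` variable and `en-US_sample.wav` that test1.py already expects are in place:

```python
# Minimal sketch: call asr_transcribe() with raw WAV bytes.
# Relies on test1.py's module-level setup (auth, offline_config, asr_service).
from test1 import asr_transcribe

with open("./en-US_sample.wav", "rb") as fh:
    audio_bytes = fh.read()

print(asr_transcribe(audio_bytes))
```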
test2.py ADDED
@@ -0,0 +1,33 @@
+ import riva.client
+ from riva.client.argparse_utils import add_asr_config_argparse_parameters, add_connection_argparse_parameters
+ import os
+ from dotenv import load_dotenv
+
+
+ # Load environment variables from .env file
+ load_dotenv()
+
+ uri = "grpc.nvcf.nvidia.com:443"
+
+ auth = riva.client.Auth(
+     uri=uri,
+     use_ssl=True,
+     metadata_args=[
+         ["function-id", "b702f636-f60c-4a3d-a6f4-f3568c13bd7d"],
+         ["authorization", f"Bearer {os.environ['NVIDIA_API']}"],
+     ]
+ )
+
+ # assuming you already created `auth`
+ asr = riva.client.ASRService(auth)
+
+ # list all available ASR models
+ models = asr.list_models()
+
+ for m in models:
+     print("Model name:", m.name)
+     print(" Description:", m.description)
+     print(" Type:", m.type)  # 'online' or 'offline'
+     print(" Sample rates:", m.supported_sample_rates)
+     print(" Languages:", m.languages)
+     print()