SreekarB committed on
Commit
f7b85fd
·
verified ·
1 Parent(s): e94791f

Upload 10 files

Browse files
Files changed (3) hide show
  1. app.py +46 -3
  2. hf_audio_utils.py +208 -0
  3. nova_sonic_tool_use.py +88 -16
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import threading
2
  import time
3
  import argparse
@@ -8,6 +9,26 @@ from session_manager import SessionManager
8
  from config import UI_TITLE, UI_SUBTITLE
9
  import gradio as gr
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  class NovaConversationApp:
12
  def __init__(self, session_id=None):
13
  # Initialize core components
@@ -43,11 +64,33 @@ class NovaConversationApp:
43
 
44
  # Run initialization in the event loop
45
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  # Initialize stream manager
47
- self.stream_manager = BedrockStreamManager(model_id='amazon.nova-sonic-v1:0', region='us-east-1')
 
48
 
49
- # Initialize audio streamer
50
- self.audio_streamer = AudioStreamer(self.stream_manager)
 
 
 
 
 
51
 
52
  # Initialize the stream in the event loop
53
  self.loop.run_until_complete(self._initialize_streaming())
 
1
+ import os
2
  import threading
3
  import time
4
  import argparse
 
9
  from config import UI_TITLE, UI_SUBTITLE
10
  import gradio as gr
11
 
12
+ # Import dotenv for environment variables if available
13
+ try:
14
+ from dotenv import load_dotenv
15
+ # Load environment variables from .env file if it exists
16
+ load_dotenv()
17
+ except ImportError:
18
+ pass
19
+
20
+ # Import HF-specific audio utils
21
+ try:
22
+ from hf_audio_utils import HFAudioStreamer
23
+ HF_AUDIO_AVAILABLE = True
24
+ except ImportError:
25
+ HF_AUDIO_AVAILABLE = False
26
+
27
+ # Check if we're in HF Spaces
28
def is_huggingface_spaces():
    """Detect if we're running on HuggingFace Spaces.

    Returns:
        bool: True when the ``SPACE_ID`` environment variable is present, or
        when ``SYSTEM`` is set to exactly ``"spaces"``; False otherwise.
    """
    # `.get()` returns None for a missing key, so no separate membership test
    # is needed and the result no longer depends on and/or precedence.
    return "SPACE_ID" in os.environ or os.environ.get("SYSTEM") == "spaces"
31
+
32
  class NovaConversationApp:
33
  def __init__(self, session_id=None):
34
  # Initialize core components
 
64
 
65
  # Run initialization in the event loop
66
  try:
67
+ # Check for AWS credentials
68
+ if not os.environ.get("AWS_ACCESS_KEY_ID") or not os.environ.get("AWS_SECRET_ACCESS_KEY"):
69
+ missing = []
70
+ if not os.environ.get("AWS_ACCESS_KEY_ID"):
71
+ missing.append("AWS_ACCESS_KEY_ID")
72
+ if not os.environ.get("AWS_SECRET_ACCESS_KEY"):
73
+ missing.append("AWS_SECRET_ACCESS_KEY")
74
+
75
+ error_msg = f"Missing AWS credentials: {', '.join(missing)}"
76
+ # Check if running in Hugging Face Spaces
77
+ if "SPACE_ID" in os.environ or ("SYSTEM" in os.environ and os.environ.get("SYSTEM") == "spaces"):
78
+ error_msg += "\nPlease add these as secrets in your Hugging Face Space settings."
79
+ else:
80
+ error_msg += "\nPlease set these environment variables or add them to a .env file."
81
+ raise ValueError(error_msg)
82
+
83
  # Initialize stream manager
84
+ region = os.environ.get("AWS_DEFAULT_REGION", "us-east-1")
85
+ self.stream_manager = BedrockStreamManager(model_id='amazon.nova-sonic-v1:0', region=region)
86
 
87
+ # Initialize the appropriate audio streamer based on environment
88
+ if is_huggingface_spaces() and HF_AUDIO_AVAILABLE:
89
+ print("Using Hugging Face Spaces-optimized audio streamer")
90
+ self.audio_streamer = HFAudioStreamer(self.stream_manager)
91
+ else:
92
+ print("Using standard audio streamer")
93
+ self.audio_streamer = AudioStreamer(self.stream_manager)
94
 
95
  # Initialize the stream in the event loop
96
  self.loop.run_until_complete(self._initialize_streaming())
hf_audio_utils.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio utilities for Hugging Face Spaces integration.
3
+ This module provides audio streaming for Hugging Face Spaces environments.
4
+ """
5
+
6
+ import os
7
+ import asyncio
8
+ import numpy as np
9
+ import random
10
+ import time
11
+ import threading
12
+ import base64
13
+
14
+ # Try to import the Hugging Face-specific audio utilities
15
+ try:
16
+ from transformers.pipelines.audio_utils import ffmpeg_microphone_live
17
+ HF_AUDIO_AVAILABLE = True
18
+ except ImportError:
19
+ HF_AUDIO_AVAILABLE = False
20
+ print("Warning: transformers.pipelines.audio_utils not available, will use fallback audio simulation")
21
+
22
class HFAudioStreamer:
    """Audio streamer for Hugging Face Spaces that works with or without real audio devices.

    Microphone input is read through ``ffmpeg_microphone_live`` when that
    import succeeded (module-level ``HF_AUDIO_AVAILABLE``); otherwise random
    low-amplitude noise is generated as a stand-in. Audio received from the
    stream manager is not played back, only logged.
    """

    def __init__(self, stream_manager):
        """Initialize the HF Audio Streamer.

        Args:
            stream_manager: Object providing ``add_audio_chunk``,
                ``send_audio_content_start_event``, ``close``,
                ``audio_output_queue`` and ``output_queue``.
        """
        self.stream_manager = stream_manager
        self.is_streaming = False
        self.use_ffmpeg = HF_AUDIO_AVAILABLE
        self.mic_stream = None
        self.mic_thread = None
        try:
            self.loop = asyncio.get_event_loop()
        except RuntimeError:
            # No current loop in this thread; start_streaming() binds the
            # running loop before anything is scheduled on it.
            self.loop = None

        # Strong references to background tasks; the event loop itself only
        # keeps weak references to tasks.
        self._tasks = []

        # Check if we're in HF Spaces
        self.is_hf_spaces = "SPACE_ID" in os.environ or os.environ.get("SYSTEM") == "spaces"

        print(f"HF Audio Streamer initialized. Using ffmpeg: {self.use_ffmpeg}, In HF Spaces: {self.is_hf_spaces}")

    @staticmethod
    def _chunk_to_pcm16(audio_chunk):
        """Convert one microphone item to 16-bit PCM bytes.

        Args:
            audio_chunk: Either a numpy array of float samples, or a dict
                carrying such an array under the ``"raw"`` key.

        Returns:
            bytes | None: int16 sample bytes, or None when the item carries
            no usable samples.
        """
        if isinstance(audio_chunk, dict):
            audio_chunk = audio_chunk.get("raw")
        if not isinstance(audio_chunk, np.ndarray) or audio_chunk.size == 0:
            return None
        # Clip before scaling so out-of-range samples saturate instead of
        # wrapping around in the int16 cast.
        clipped = np.clip(audio_chunk, -1.0, 1.0)
        return (clipped * 32767).astype(np.int16).tobytes()

    def _mic_thread_worker(self):
        """Thread function to capture audio from ffmpeg and send it to the stream manager.

        Runs until ``is_streaming`` is cleared or the microphone stream ends.
        On any error while streaming is still active, switches to simulated
        input scheduled on ``self.loop``.
        """
        if not self.use_ffmpeg:
            return

        print("Starting microphone capture using ffmpeg")

        try:
            # Set up the mic stream with ffmpeg
            sampling_rate = 16000  # 16kHz as required by Nova Sonic
            chunk_length_s = 2.0  # Process 2 seconds at a time
            stream_chunk_s = 0.25  # Stream in 0.25 second chunks

            # Create the mic stream
            self.mic_stream = ffmpeg_microphone_live(
                sampling_rate=sampling_rate,
                chunk_length_s=chunk_length_s,
                stream_chunk_s=stream_chunk_s,
            )

            # NOTE(review): with stream_chunk_s set, the iterator appears to
            # yield growing partial chunks and overlapping strides; forwarding
            # every item may resend samples -- confirm against the
            # ffmpeg_microphone_live documentation.
            for audio_chunk in self.mic_stream:
                if not self.is_streaming:
                    break

                # Items arrive as dicts with the samples under "raw"; plain
                # arrays are accepted as well.
                audio_bytes = self._chunk_to_pcm16(audio_chunk)
                if audio_bytes:
                    # Send to Bedrock
                    asyncio.run_coroutine_threadsafe(
                        self._send_audio_chunk(audio_bytes),
                        self.loop
                    )

        except Exception as e:
            print(f"Error in microphone thread: {e}")
            if self.is_streaming:
                # Fall back to simulated audio if ffmpeg fails
                print("Falling back to simulated audio input")
                self.use_ffmpeg = False
                asyncio.run_coroutine_threadsafe(
                    self.generate_simulated_input(),
                    self.loop
                )

    async def _send_audio_chunk(self, audio_bytes):
        """Send an audio chunk to the stream manager.

        Does nothing when streaming is stopped, no stream manager is set, or
        ``audio_bytes`` is empty.
        """
        if self.is_streaming and self.stream_manager and audio_bytes:
            self.stream_manager.add_audio_chunk(audio_bytes)

    async def generate_simulated_input(self):
        """Generate simulated audio input until streaming stops."""
        print("Generating simulated audio input...")

        CHUNK_SIZE = 1024  # Standard audio chunk size
        CHANNELS = 1  # Mono audio

        while self.is_streaming:
            try:
                # Generate a dummy audio chunk with some basic noise
                samples = np.random.normal(0, 0.01, CHUNK_SIZE * CHANNELS).astype(np.float32)
                audio_data = (samples * 32767).astype(np.int16).tobytes()

                # Send to Bedrock
                await self._send_audio_chunk(audio_data)

                # Wait a bit between chunks
                await asyncio.sleep(0.05)

                # Occasionally pause longer to mimic the end of an utterance
                if random.random() < 0.05:  # 5% chance to end speech
                    print("Simulated speech ended, awaiting response...")
                    await asyncio.sleep(1.0)  # Wait longer between "sentences"

            except Exception as e:
                if self.is_streaming:
                    print(f"Error generating simulated audio: {e}")
                await asyncio.sleep(0.5)

    async def play_output_audio(self):
        """Handle audio output (in Hugging Face, we just log it)."""
        while self.is_streaming:
            try:
                # Get audio data from the stream manager's queue
                audio_data = await asyncio.wait_for(
                    self.stream_manager.audio_output_queue.get(),
                    timeout=0.1
                )

                if audio_data and self.is_streaming:
                    # In HF Spaces, just log that we received audio
                    audio_size = len(audio_data)
                    print(f"Received {audio_size} bytes of audio from Nova")

                    # Publish a placeholder event; the audio bytes themselves
                    # are not forwarded.
                    self.stream_manager.output_queue.put_nowait({
                        "event": {
                            "audioOutput": {
                                "content": "Audio would play here if audio devices were available"
                            }
                        }
                    })
            except asyncio.TimeoutError:
                # No message received within timeout, continue
                continue
            except Exception as e:
                if self.is_streaming:
                    print(f"Error processing audio output: {str(e)}")
                await asyncio.sleep(0.05)

    async def start_streaming(self):
        """Start streaming audio and block until streaming is stopped.

        With an interactive stdin, pressing Enter stops streaming. Without
        one, streaming continues until ``stop_streaming()`` is called
        elsewhere.
        """
        if self.is_streaming:
            return

        print("Starting HF audio streaming...")
        print("Press Enter to stop streaming...")

        # The mic thread hands work over with run_coroutine_threadsafe, which
        # must target the loop that is actually running this coroutine.
        self.loop = asyncio.get_running_loop()

        # Send audio content start event
        await self.stream_manager.send_audio_content_start_event()

        self.is_streaming = True

        # Drop finished tasks from earlier runs before adding new ones
        self._tasks = [task for task in self._tasks if not task.done()]

        # Set up tasks based on mode
        if self.use_ffmpeg:
            # Start the ffmpeg microphone thread
            self.mic_thread = threading.Thread(target=self._mic_thread_worker)
            self.mic_thread.daemon = True
            self.mic_thread.start()
        else:
            # Use simulated input
            self._tasks.append(asyncio.create_task(self.generate_simulated_input()))

        # Always process output
        self._tasks.append(asyncio.create_task(self.play_output_audio()))

        # Wait for user to press Enter to stop
        try:
            await self.loop.run_in_executor(None, input)
        except EOFError:
            # No usable stdin: keep streaming until stop_streaming() is
            # called from elsewhere instead of failing here.
            while self.is_streaming:
                await asyncio.sleep(0.1)
            return

        # Once input() returns, stop streaming
        await self.stop_streaming()

    async def stop_streaming(self):
        """Stop streaming audio and close the stream manager."""
        if not self.is_streaming:
            return

        print("Stopping HF audio streaming...")
        self.is_streaming = False

        # Stop the ffmpeg mic stream if it's active
        if self.mic_stream:
            try:
                self.mic_stream.close()
            except Exception:
                # Best effort: closing can fail while the worker thread is
                # still iterating; the worker exits on its own once it sees
                # is_streaming cleared.
                pass
            self.mic_stream = None

        # Wait for the thread to finish if it exists
        if self.mic_thread and self.mic_thread.is_alive():
            self.mic_thread.join(timeout=2.0)
        self.mic_thread = None

        # Always close the stream manager
        await self.stream_manager.close()
nova_sonic_tool_use.py CHANGED
@@ -19,6 +19,36 @@ try:
19
  except ImportError:
20
  print("Warning: python-dotenv not installed, using environment variables directly")
21
  pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  from aws_sdk_bedrock_runtime.client import BedrockRuntimeClient, InvokeModelWithBidirectionalStreamOperationInput
23
  from aws_sdk_bedrock_runtime.models import InvokeModelWithBidirectionalStreamInputChunk, BidirectionalInputPayloadPart
24
  from aws_sdk_bedrock_runtime.config import Config, HTTPAuthSchemeResolver, SigV4AuthScheme
@@ -296,14 +326,34 @@ class BedrockStreamManager:
296
 
297
  def _initialize_client(self):
298
  """Initialize the Bedrock client."""
299
- config = Config(
300
- endpoint_uri=f"https://bedrock-runtime.{self.region}.amazonaws.com",
301
- region=self.region,
302
- aws_credentials_identity_resolver=EnvironmentCredentialsResolver(),
303
- http_auth_scheme_resolver=HTTPAuthSchemeResolver(),
304
- http_auth_schemes={"aws.auth#sigv4": SigV4AuthScheme()}
305
- )
306
- self.bedrock_client = BedrockRuntimeClient(config=config)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
 
308
  async def initialize_stream(self):
309
  """Initialize the bidirectional stream with Bedrock."""
@@ -953,24 +1003,46 @@ async def main(debug=False):
953
  global DEBUG
954
  DEBUG = debug
955
 
956
- # Create stream manager
957
- stream_manager = BedrockStreamManager(model_id='amazon.nova-sonic-v1:0', region='us-east-1')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
958
 
959
- # Create audio streamer
960
- audio_streamer = AudioStreamer(stream_manager)
961
 
962
- # Initialize the stream
963
- await time_it_async("initialize_stream", stream_manager.initialize_stream)
964
 
965
- try:
966
  # This will run until the user presses Enter
967
  await audio_streamer.start_streaming()
968
 
969
  except KeyboardInterrupt:
970
  print("Interrupted by user")
 
 
 
 
 
971
  finally:
972
  # Clean up
973
- await audio_streamer.stop_streaming()
 
974
 
975
 
976
  if __name__ == "__main__":
 
19
  except ImportError:
20
  print("Warning: python-dotenv not installed, using environment variables directly")
21
  pass
22
+
23
+ # Check for HuggingFace Spaces environment
24
def is_huggingface_spaces():
    """Detect if we're running on HuggingFace Spaces.

    Returns:
        bool: True when the ``SPACE_ID`` environment variable is present, or
        when ``SYSTEM`` is set to exactly ``"spaces"``; False otherwise.
    """
    # `.get()` returns None for a missing key, so no separate membership test
    # is needed and the result no longer depends on and/or precedence.
    return "SPACE_ID" in os.environ or os.environ.get("SYSTEM") == "spaces"
27
+
28
+ # Handle HuggingFace Spaces secrets
29
def setup_environment_variables():
    """Populate the standard AWS environment variables.

    On Hugging Face Spaces, an ``HF_``-prefixed secret is copied to its
    standard AWS name when the standard name is unset or empty. In every
    environment, ``AWS_DEFAULT_REGION`` falls back to ``us-east-1`` when it
    is unset or empty.
    """
    if is_huggingface_spaces():
        print("Detected HuggingFace Spaces environment, checking for secrets...")

        # Secrets may have been stored under an HF_-prefixed alias.
        for target in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
            alias = f"HF_{target}"
            alias_value = os.environ.get(alias)
            if alias_value and not os.environ.get(target):
                os.environ[target] = alias_value
                print(f"Using {alias}")

    # Fall back to the default region when none is configured.
    if not os.environ.get("AWS_DEFAULT_REGION"):
        os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
        print("Set default AWS region to us-east-1")
49
+
50
+ # Set up environment variables
51
+ setup_environment_variables()
52
  from aws_sdk_bedrock_runtime.client import BedrockRuntimeClient, InvokeModelWithBidirectionalStreamOperationInput
53
  from aws_sdk_bedrock_runtime.models import InvokeModelWithBidirectionalStreamInputChunk, BidirectionalInputPayloadPart
54
  from aws_sdk_bedrock_runtime.config import Config, HTTPAuthSchemeResolver, SigV4AuthScheme
 
326
 
327
  def _initialize_client(self):
328
  """Initialize the Bedrock client."""
329
+ # Double-check AWS credentials before initializing
330
+ if not os.environ.get("AWS_ACCESS_KEY_ID") or not os.environ.get("AWS_SECRET_ACCESS_KEY"):
331
+ missing = []
332
+ if not os.environ.get("AWS_ACCESS_KEY_ID"):
333
+ missing.append("AWS_ACCESS_KEY_ID")
334
+ if not os.environ.get("AWS_SECRET_ACCESS_KEY"):
335
+ missing.append("AWS_SECRET_ACCESS_KEY")
336
+
337
+ error_msg = f"Missing AWS credentials: {', '.join(missing)}"
338
+ if is_huggingface_spaces():
339
+ error_msg += "\nPlease add these as secrets in your Hugging Face Space settings."
340
+ else:
341
+ error_msg += "\nPlease set these environment variables or add them to a .env file."
342
+ raise ValueError(error_msg)
343
+
344
+ try:
345
+ config = Config(
346
+ endpoint_uri=f"https://bedrock-runtime.{self.region}.amazonaws.com",
347
+ region=self.region,
348
+ aws_credentials_identity_resolver=EnvironmentCredentialsResolver(),
349
+ http_auth_scheme_resolver=HTTPAuthSchemeResolver(),
350
+ http_auth_schemes={"aws.auth#sigv4": SigV4AuthScheme()}
351
+ )
352
+ self.bedrock_client = BedrockRuntimeClient(config=config)
353
+ except Exception as e:
354
+ error_msg = f"Failed to initialize AWS Bedrock client: {str(e)}"
355
+ print(error_msg)
356
+ raise ValueError(error_msg)
357
 
358
  async def initialize_stream(self):
359
  """Initialize the bidirectional stream with Bedrock."""
 
1003
  global DEBUG
1004
  DEBUG = debug
1005
 
1006
+ # Check AWS credentials first
1007
+ missing_creds = []
1008
+ if not os.environ.get("AWS_ACCESS_KEY_ID"):
1009
+ missing_creds.append("AWS_ACCESS_KEY_ID")
1010
+ if not os.environ.get("AWS_SECRET_ACCESS_KEY"):
1011
+ missing_creds.append("AWS_SECRET_ACCESS_KEY")
1012
+
1013
+ if missing_creds:
1014
+ error_message = f"Missing AWS credentials: {', '.join(missing_creds)}"
1015
+ if is_huggingface_spaces():
1016
+ error_message += "\nPlease add these secrets in your Hugging Face Space's settings."
1017
+ else:
1018
+ error_message += "\nPlease set these environment variables or create a .env file."
1019
+ print(error_message)
1020
+ return
1021
+
1022
+ try:
1023
+ # Create stream manager
1024
+ stream_manager = BedrockStreamManager(model_id='amazon.nova-sonic-v1:0', region=os.environ.get("AWS_DEFAULT_REGION", "us-east-1"))
1025
 
1026
+ # Create audio streamer
1027
+ audio_streamer = AudioStreamer(stream_manager)
1028
 
1029
+ # Initialize the stream
1030
+ await time_it_async("initialize_stream", stream_manager.initialize_stream)
1031
 
 
1032
  # This will run until the user presses Enter
1033
  await audio_streamer.start_streaming()
1034
 
1035
  except KeyboardInterrupt:
1036
  print("Interrupted by user")
1037
+ except Exception as e:
1038
+ print(f"Error running application: {e}")
1039
+ if DEBUG:
1040
+ import traceback
1041
+ traceback.print_exc()
1042
  finally:
1043
  # Clean up
1044
+ if 'audio_streamer' in locals():
1045
+ await audio_streamer.stop_streaming()
1046
 
1047
 
1048
  if __name__ == "__main__":