Update app.py
app.py CHANGED
@@ -6,18 +6,16 @@ import io
 import PIL.Image
 import asyncio
 import os
+import sounddevice as sd
 from google import genai
 from streamlit_webrtc import webrtc_streamer
 import av
-import pyaudio
 from mediapipe.tasks import python
 from mediapipe.tasks.python import vision
 
 # Configuration
-FORMAT = pyaudio.paInt16
 CHANNELS = 1
-SEND_SAMPLE_RATE = 16000
-RECEIVE_SAMPLE_RATE = 24000
+SAMPLE_RATE = 16000
 CHUNK_SIZE = 1024
 
 # Initialize Genai client
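The collapsed constants track the Live API's audio contract: input is expected as 16 kHz, 16-bit mono PCM, while the deleted RECEIVE_SAMPLE_RATE covered the 24 kHz audio the model sends back. One detail the new code leaves implicit is the sample format: sounddevice delivers float32 blocks unless told otherwise, whereas "audio/pcm" payloads are conventionally 16-bit integers. A minimal capture sketch, separate from the committed code, that pins the dtype explicitly:

import numpy as np
import sounddevice as sd

CHANNELS = 1
SAMPLE_RATE = 16000
CHUNK_SIZE = 1024

frames = []

def callback(indata, frame_count, time_info, status):
    # indata arrives as a (frame_count, CHANNELS) int16 array thanks to dtype below
    frames.append(indata.copy())

with sd.InputStream(channels=CHANNELS, samplerate=SAMPLE_RATE,
                    blocksize=CHUNK_SIZE, dtype='int16',
                    callback=callback):
    sd.sleep(1000)  # capture roughly one second

pcm_bytes = np.concatenate(frames).tobytes()  # ready for a {"mime_type": "audio/pcm"} payload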
@@ -28,26 +26,32 @@ CONFIG = {"generation_config": {"response_modalities": ["AUDIO"]}}
 
 class AudioProcessor:
     def __init__(self):
-        self.audio = pyaudio.PyAudio()
         self.stream = None
         self.audio_queue = asyncio.Queue()
+
+    def audio_callback(self, indata, frames, time, status):
+        """This is called (from a separate thread) for each audio block."""
+        if status:
+            print(status)
+        self.audio_queue.put_nowait(indata.copy())
 
     def start_stream(self):
-        ... (9 deleted lines not shown in this view)
+        try:
+            self.stream = sd.InputStream(
+                channels=CHANNELS,
+                samplerate=SAMPLE_RATE,
+                callback=self.audio_callback,
+                blocksize=CHUNK_SIZE
+            )
+            self.stream.start()
+        except Exception as e:
+            st.error(f"Error starting audio stream: {str(e)}")
 
     def stop_stream(self):
-        if self.stream:
-            self.stream.stop_stream()
+        if self.stream is not None:
+            self.stream.stop()
             self.stream.close()
-            self.audio.terminate()
+            self.stream = None
 
 class VideoProcessor:
     def __init__(self):
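One caveat worth flagging in the new AudioProcessor: sounddevice runs audio_callback on a PortAudio worker thread, and asyncio.Queue is not thread-safe, so put_nowait from that thread can race with a consumer on the event loop. The pattern used in sounddevice's own asyncio examples hands each block to the loop via call_soon_threadsafe. A sketch of that variant (the class name and loop parameter are illustrative, not part of the commit):

import asyncio
import sounddevice as sd

class SafeAudioProcessor:
    def __init__(self, loop: asyncio.AbstractEventLoop):
        self.loop = loop                    # event loop that owns the queue
        self.audio_queue = asyncio.Queue()
        self.stream = None

    def audio_callback(self, indata, frames, time, status):
        if status:
            print(status)
        # Schedule the put on the loop thread instead of touching the queue here.
        self.loop.call_soon_threadsafe(self.audio_queue.put_nowait, indata.copy())

The caller would construct it with asyncio.get_running_loop() from inside the coroutine that consumes the queue.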
@@ -58,22 +62,17 @@ class VideoProcessor:
             min_detection_confidence=0.5)
 
     def video_frame_callback(self, frame):
-        # Convert the frame to RGB
         img = frame.to_ndarray(format="rgb24")
 
-        # Process the frame with MediaPipe
         results = self.face_detection.process(img)
 
-        # Draw face detection annotations if faces are detected
         if results.detections:
             for detection in results.detections:
                 self.mp_draw.draw_detection(img, detection)
 
-        # Convert to PIL Image
         pil_img = PIL.Image.fromarray(img)
         pil_img.thumbnail([1024, 1024])
 
-        # Prepare frame data for Gemini
         image_io = io.BytesIO()
         pil_img.save(image_io, format="jpeg")
         image_io.seek(0)
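The unchanged lines between this hunk and the next (old lines 80 through 90) are hidden by the diff, but after image_io.seek(0) the JPEG presumably gets wrapped into a message for Gemini. In Google's Live API examples a frame is sent as a base64-encoded "image/jpeg" part; a sketch of that packaging, hypothetical since the hidden lines may differ:

import base64
import io

def jpeg_part(image_io: io.BytesIO) -> dict:
    """Wrap an in-memory JPEG as a Live API realtime-input part."""
    image_bytes = image_io.read()  # the buffer was already rewound with seek(0)
    return {"mime_type": "image/jpeg",
            "data": base64.b64encode(image_bytes).decode()}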
@@ -91,7 +90,6 @@ class VideoProcessor:
         return av.VideoFrame.from_ndarray(img, format="rgb24")
 
     def __del__(self):
-        # Cleanup MediaPipe resources
         if hasattr(self, 'face_detection'):
             self.face_detection.close()
 
@@ -113,17 +111,14 @@ def display_chat_messages():
 def main():
     st.title("Gemini Interactive Assistant")
 
-    # Initialize session state
     initialize_session_state()
 
-    # Sidebar configuration
     st.sidebar.title("Settings")
     input_mode = st.sidebar.radio(
         "Input Mode",
         ["Text Only", "Audio + Video", "Audio Only"]
     )
 
-    # Enable face detection option
     enable_face_detection = st.sidebar.checkbox("Enable Face Detection", value=True)
 
     if enable_face_detection:
@@ -140,14 +135,11 @@ def main():
         )
     )
 
-    # Display chat history
     display_chat_messages()
 
-    # Main interaction area
     if input_mode == "Text Only":
         user_input = st.chat_input("Your message")
         if user_input:
-            # Add user message to chat
             st.session_state.messages.append({"role": "user", "content": user_input})
             with st.chat_message("user"):
                 st.markdown(user_input)
@@ -158,7 +150,6 @@ def main():
                 turn = session.receive()
                 async for response in turn:
                     if text := response.text:
-                        # Add assistant response to chat
                         st.session_state.messages.append(
                             {"role": "assistant", "content": text}
                         )
@@ -168,7 +159,6 @@ def main():
            asyncio.run(send_message())
 
    else:
-        # Video stream setup
        if input_mode == "Audio + Video":
            ctx = webrtc_streamer(
                key="gemini-stream",
@@ -177,7 +167,6 @@ def main():
                media_stream_constraints={"video": True, "audio": True},
            )
 
-        # Audio controls
        col1, col2 = st.columns(2)
        with col1:
            if st.button("Start Recording", type="primary"):
@@ -191,12 +180,15 @@ def main():
 
        async def process_audio_stream():
            while st.session_state.get('recording', False):
-                ... (2 deleted lines not shown in this view)
+                try:
+                    audio_data = await st.session_state.audio_processor.audio_queue.get()
                    await st.session_state.audio_processor.audio_queue.put({
-                        "data":
-                        "mime_type": "audio/pcm"
+                        "data": audio_data.tobytes(),
+                        "mime_type": "audio/pcm",
+                        "sample_rate": SAMPLE_RATE
                    })
+                except asyncio.QueueEmpty:
+                    pass
                await asyncio.sleep(0.1)
 
 if __name__ == "__main__":
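As committed, process_audio_stream pulls a raw block off audio_queue and then puts the wrapped dict back onto the same queue, so raw arrays and dicts end up interleaved for whichever consumer reads next; the except branch is also unreachable, because await Queue.get() suspends until an item arrives and never raises asyncio.QueueEmpty (only get_nowait() does). A sketch of the apparent intent using a separate outbound queue (out_queue is hypothetical, not in the commit):

out_queue = asyncio.Queue()

async def process_audio_stream():
    while st.session_state.get('recording', False):
        # Suspends until the capture callback delivers a block; no QueueEmpty here.
        audio_data = await st.session_state.audio_processor.audio_queue.get()
        await out_queue.put({
            "data": audio_data.tobytes(),
            "mime_type": "audio/pcm",
            "sample_rate": SAMPLE_RATE,
        })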