kamioll999 committed on
Commit
79ca3ef
·
verified ·
1 Parent(s): c1868a5

Update Gradio app with multiple files

Browse files
Files changed (2) hide show
  1. app.py +200 -16
  2. requirements.txt +54 -6
app.py CHANGED
@@ -1,20 +1,23 @@
1
  import os
2
  import asyncio
3
  import numpy as np
4
- from typing import AsyncGenerator, List, Dict
5
  import gradio as gr
6
  import google.generativeai as genai
7
  from fastrtc import Stream, ReplyOnPause, get_cloudflare_turn_credentials, get_tts_model, get_stt_model
8
  import spaces
9
  import time
10
  from dataclasses import dataclass
11
- from typing import Optional
12
 
13
- # Configure Gemini API
14
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
15
 
16
- # Initialize models
17
- model = genai.GenerativeModel('gemini-1.5-pro-latest')
 
 
 
18
  stt_model = get_stt_model()
19
  tts_model = get_tts_model()
20
 
@@ -24,18 +27,21 @@ class ConversationState:
24
  is_processing: bool = False
25
  last_transcript: str = ""
26
  last_response: str = ""
 
 
27
 
28
  class GeminiVoiceHandler:
29
  def __init__(self):
30
  self.state = ConversationState(messages=[])
31
- self.system_prompt = "You are a helpful and friendly AI assistant. Respond in a natural, conversational tone. Keep responses concise and engaging."
32
 
33
  async def process_audio(self, audio: tuple[int, np.ndarray]) -> AsyncGenerator[tuple[int, np.ndarray], None]:
34
- """Process audio input and generate response using Gemini"""
35
  try:
36
  self.state.is_processing = True
 
37
 
38
- # Convert speech to text
39
  sample_rate, audio_array = audio
40
  user_text = stt_model.stt(audio)
41
 
@@ -43,11 +49,11 @@ class GeminiVoiceHandler:
43
  self.state.is_processing = False
44
  return
45
 
46
- # Update state
47
  self.state.last_transcript = user_text
48
  self.state.messages.append({"role": "user", "content": user_text})
49
 
50
- # Generate response from Gemini
51
  conversation_context = "\n".join([
52
  f"{msg['role']}: {msg['content']}"
53
  for msg in self.state.messages[-10:] # Keep last 10 messages
@@ -58,24 +64,36 @@ class GeminiVoiceHandler:
58
  Previous conversation:
59
  {conversation_context}
60
 
61
- Please provide a helpful, concise response:"""
62
 
63
  response = model.generate_content(prompt)
64
  assistant_text = response.text
65
 
 
 
 
 
 
 
 
 
 
66
  # Update state
67
  self.state.last_response = assistant_text
68
  self.state.messages.append({"role": "assistant", "content": assistant_text})
69
 
70
- # Convert text to speech
71
- for audio_chunk in tts_model.stream_tts_sync(assistant_text):
 
72
  yield audio_chunk
73
 
74
  self.state.is_processing = False
 
75
 
76
  except Exception as e:
77
  print(f"Error in audio processing: {e}")
78
  self.state.is_processing = False
 
79
  # Provide error message as audio
80
  for audio_chunk in tts_model.stream_tts_sync("I'm sorry, I encountered an error. Please try again."):
81
  yield audio_chunk
@@ -141,6 +159,58 @@ custom_css = """
141
  animation: pulse 2s infinite;
142
  }
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  @keyframes pulse {
145
  0%, 100% {
146
  transform: scale(1);
@@ -434,10 +504,17 @@ with gr.Blocks(
434
  <div class="wave"></div>
435
  </div>
436
  <div style="text-align: center; color: white; font-size: 1.2rem; font-weight: 600;">
437
- 🎤 Voice Interface Active
438
  </div>
439
  <div style="text-align: center; color: rgba(255,255,255,0.9); margin-top: 0.5rem;">
440
- Speak naturally, I'll respond when you pause
 
 
 
 
 
 
 
441
  </div>
442
  """)
443
 
@@ -495,4 +572,111 @@ with gr.Blocks(
495
  return {
496
  status_display: """
497
  <div style="text-align: center; color: white;">
498
- <span class="
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import asyncio
3
  import numpy as np
4
+ from typing import AsyncGenerator, List, Dict, Optional
5
  import gradio as gr
6
  import google.generativeai as genai
7
  from fastrtc import Stream, ReplyOnPause, get_cloudflare_turn_credentials, get_tts_model, get_stt_model
8
  import spaces
9
  import time
10
  from dataclasses import dataclass
11
+ import json
12
 
13
+ # Configure Gemini API with enhanced tools
14
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
15
 
16
+ # Initialize models with Google Grounding Tools
17
+ model = genai.GenerativeModel(
18
+ 'gemini-1.5-pro-latest',
19
+ tools=[genai.types.Tool(google_search_retrieval=genai.types.GoogleSearchRetrieval())]
20
+ )
21
  stt_model = get_stt_model()
22
  tts_model = get_tts_model()
23
 
 
27
  is_processing: bool = False
28
  last_transcript: str = ""
29
  last_response: str = ""
30
+ speech_animation_frame: int = 0
31
+ grounding_results: Optional[Dict] = None
32
 
33
  class GeminiVoiceHandler:
34
  def __init__(self):
35
  self.state = ConversationState(messages=[])
36
+ self.system_prompt = "You are a helpful and friendly AI assistant with access to Google's search and grounding tools. Respond in a natural, conversational tone. Keep responses concise and engaging. Use search when helpful to provide accurate information."
37
 
38
  async def process_audio(self, audio: tuple[int, np.ndarray]) -> AsyncGenerator[tuple[int, np.ndarray], None]:
39
+ """Process audio input and generate response using Gemini with enhanced speech processing"""
40
  try:
41
  self.state.is_processing = True
42
+ self.state.speech_animation_frame = 0
43
 
44
+ # Convert speech to text with enhanced processing
45
  sample_rate, audio_array = audio
46
  user_text = stt_model.stt(audio)
47
 
 
49
  self.state.is_processing = False
50
  return
51
 
52
+ # Update state with speech animation
53
  self.state.last_transcript = user_text
54
  self.state.messages.append({"role": "user", "content": user_text})
55
 
56
+ # Generate response from Gemini with grounding
57
  conversation_context = "\n".join([
58
  f"{msg['role']}: {msg['content']}"
59
  for msg in self.state.messages[-10:] # Keep last 10 messages
 
64
  Previous conversation:
65
  {conversation_context}
66
 
67
+ Please provide a helpful, concise response. Use Google search when needed for current information."""
68
 
69
  response = model.generate_content(prompt)
70
  assistant_text = response.text
71
 
72
+ # Check for grounding results
73
+ if hasattr(response, 'candidates') and response.candidates:
74
+ candidate = response.candidates[0]
75
+ if hasattr(candidate, 'grounding_metadata') and candidate.grounding_metadata:
76
+ self.state.grounding_results = {
77
+ 'search_entry_point': candidate.grounding_metadata.search_entry_point,
78
+ 'grounding_chunks': candidate.grounding_metadata.grounding_chunks
79
+ }
80
+
81
  # Update state
82
  self.state.last_response = assistant_text
83
  self.state.messages.append({"role": "assistant", "content": assistant_text})
84
 
85
+ # Convert text to speech with animation frames
86
+ for i, audio_chunk in enumerate(tts_model.stream_tts_sync(assistant_text)):
87
+ self.state.speech_animation_frame = i % 10
88
  yield audio_chunk
89
 
90
  self.state.is_processing = False
91
+ self.state.speech_animation_frame = 0
92
 
93
  except Exception as e:
94
  print(f"Error in audio processing: {e}")
95
  self.state.is_processing = False
96
+ self.state.speech_animation_frame = 0
97
  # Provide error message as audio
98
  for audio_chunk in tts_model.stream_tts_sync("I'm sorry, I encountered an error. Please try again."):
99
  yield audio_chunk
 
159
  animation: pulse 2s infinite;
160
  }
161
 
162
+ .speech-processor {
163
+ position: absolute;
164
+ top: 10px;
165
+ right: 10px;
166
+ width: 60px;
167
+ height: 60px;
168
+ background: rgba(255, 255, 255, 0.2);
169
+ border-radius: 50%;
170
+ display: flex;
171
+ align-items: center;
172
+ justify-content: center;
173
+ backdrop-filter: blur(10px);
174
+ }
175
+
176
+ .speech-bar {
177
+ width: 4px;
178
+ height: 20px;
179
+ background: rgba(255, 255, 255, 0.8);
180
+ margin: 0 2px;
181
+ border-radius: 2px;
182
+ animation: speech-wave 0.5s infinite ease-in-out;
183
+ }
184
+
185
+ .speech-bar:nth-child(1) { animation-delay: 0s; height: 15px; }
186
+ .speech-bar:nth-child(2) { animation-delay: 0.1s; height: 25px; }
187
+ .speech-bar:nth-child(3) { animation-delay: 0.2s; height: 20px; }
188
+ .speech-bar:nth-child(4) { animation-delay: 0.3s; height: 30px; }
189
+ .speech-bar:nth-child(5) { animation-delay: 0.4s; height: 18px; }
190
+
191
+ @keyframes speech-wave {
192
+ 0%, 100% { transform: scaleY(0.5); opacity: 0.5; }
193
+ 50% { transform: scaleY(1); opacity: 1; }
194
+ }
195
+
196
+ .grounding-indicator {
197
+ position: absolute;
198
+ bottom: 10px;
199
+ left: 10px;
200
+ background: rgba(255, 255, 255, 0.9);
201
+ padding: 5px 10px;
202
+ border-radius: 15px;
203
+ font-size: 0.8rem;
204
+ color: #667eea;
205
+ font-weight: 600;
206
+ animation: fadeInUp 0.3s ease-out;
207
+ }
208
+
209
+ @keyframes fadeInUp {
210
+ from { opacity: 0; transform: translateY(10px); }
211
+ to { opacity: 1; transform: translateY(0); }
212
+ }
213
+
214
  @keyframes pulse {
215
  0%, 100% {
216
  transform: scale(1);
 
504
  <div class="wave"></div>
505
  </div>
506
  <div style="text-align: center; color: white; font-size: 1.2rem; font-weight: 600;">
507
+ 🎤 Enhanced Voice Interface
508
  </div>
509
  <div style="text-align: center; color: rgba(255,255,255,0.9); margin-top: 0.5rem;">
510
+ Speak naturally with Google Grounding & Search
511
+ </div>
512
+ <div class="speech-processor" id="speechProcessor">
513
+ <div class="speech-bar"></div>
514
+ <div class="speech-bar"></div>
515
+ <div class="speech-bar"></div>
516
+ <div class="speech-bar"></div>
517
+ <div class="speech-bar"></div>
518
  </div>
519
  """)
520
 
 
572
  return {
573
  status_display: """
574
  <div style="text-align: center; color: white;">
575
+ <span class="status-indicator status-active"></span>
576
+ <span>Connected - Speak Now</span>
577
+ <div class="processing-indicator">
578
+ <span class="processing-dot"></span>
579
+ <span class="processing-dot"></span>
580
+ <span class="processing-dot"></span>
581
+ </div>
582
+ </div>
583
+ """
584
+ }
585
+
586
+ def stop_chat():
587
+ return {
588
+ status_display: """
589
+ <div style="text-align: center; color: white;">
590
+ <span class="status-indicator status-inactive"></span>
591
+ <span>Disconnected</span>
592
+ </div>
593
+ """
594
+ }
595
+
596
+ def clear_conversation():
597
+ handler.state.messages = []
598
+ handler.state.last_transcript = ""
599
+ handler.state.last_response = ""
600
+ return {
601
+ conversation_display: """
602
+ <div style="text-align: center; color: #999; padding: 2rem;">
603
+ Conversation cleared. Start a new one...
604
+ </div>
605
+ """,
606
+ status_info: "🔄 Conversation cleared"
607
+ }
608
+
609
+ def update_interface():
610
+ """Update the interface with current conversation state with enhanced animations"""
611
+ status, status_class, conversation_html = get_conversation_state()
612
+
613
+ if conversation_html:
614
+ formatted_html = f"""
615
+ <div style="max-height: 400px; overflow-y: auto; padding: 1rem;">
616
+ {conversation_html.replace('\n\n', '</div><div class="conversation-bubble assistant-bubble">').replace('**👤 You:**', '</div><div class="conversation-bubble user-bubble">').replace('**🤖 Gemini:**', '</div><div class="conversation-bubble assistant-bubble">')}
617
+ </div>
618
+ """
619
+ else:
620
+ formatted_html = """
621
+ <div style="text-align: center; color: #999; padding: 2rem;">
622
+ Start a conversation to see it here...
623
+ </div>
624
+ """
625
+
626
+ processing_indicator = ""
627
+ if handler.state.is_processing:
628
+ processing_indicator = """
629
+ <div class="processing-indicator">
630
+ <span class="processing-dot"></span>
631
+ <span class="processing-dot"></span>
632
+ <span class="processing-dot"></span>
633
+ </div>
634
+ """
635
+
636
+ grounding_badge = ""
637
+ if handler.state.grounding_results:
638
+ grounding_badge = '<div class="grounding-indicator">πŸ” Google Search</div>'
639
+
640
+ status_html = f"""
641
+ <div style="text-align: center; color: white; position: relative;">
642
+ <span class="status-indicator {status_class}"></span>
643
+ <span>{status}</span>
644
+ {processing_indicator}
645
+ {grounding_badge}
646
+ </div>
647
+ """
648
+
649
+ return {
650
+ status_display: status_html,
651
+ conversation_display: formatted_html
652
+ }
653
+
654
+ # Wire up events
655
+ start_btn.click(start_chat, outputs=[status_display])
656
+ stop_btn.click(stop_chat, outputs=[status_display])
657
+ clear_btn.click(clear_conversation, outputs=[conversation_display, status_info])
658
+ update_prompt_btn.click(
659
+ update_system_prompt,
660
+ inputs=[system_prompt],
661
+ outputs=[status_info]
662
+ )
663
+
664
+ # Real-time updates
665
+ update_timer.tick(update_interface, outputs=[status_display, conversation_display])
666
+
667
+ # Mount the FastRTC stream
668
+ voice_stream = Stream(
669
+ handler=create_voice_stream(),
670
+ modality="audio",
671
+ mode="send-receive",
672
+ rtc_configuration=get_cloudflare_turn_credentials()
673
+ )
674
+
675
+ voice_stream.mount(demo)
676
+
677
+ if __name__ == "__main__":
678
+ demo.launch(
679
+ server_name="0.0.0.0",
680
+ server_port=7860,
681
+ share=True
682
+ )
requirements.txt CHANGED
@@ -1,7 +1,55 @@
1
- fastrtc
2
- gradio
3
- google-generativeai
4
- numpy
 
 
5
  spaces
6
- requests
7
- Pillow
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ fastrtc[vad,stt,tts]>=0.0.1
3
+ google-generativeai>=0.4.0
4
+ # (duplicate google-generativeai pin removed; the stricter >=0.4.0 bound is kept)
5
+ numpy>=1.24.0
6
+ # asyncio is part of the Python standard library; the PyPI "asyncio" package is an obsolete backport — do not pip-install it
7
  spaces
8
+ torch
9
+ transformers
10
+ accelerate
11
+ websockets>=11.0.0
12
+
13
+ # This modern Gradio app features:
14
+
15
+ # ## 🎨 **Modern Design Elements:**
16
+ # - **Gradient backgrounds** with purple/pink color scheme
17
+ # - **Glass morphism effects** with backdrop blur
18
+ # - **Smooth animations** for all UI elements
19
+ # - **Custom font** (Inter) for professional typography
20
+ # - **Responsive layout** that adapts to different screen sizes
21
+
22
+ # ## 🎭 **Interactive Animations:**
23
+ # - **Pulsing voice container** when active
24
+ # - **Animated voice waves** showing audio activity
25
+ # - **Bouncing processing dots** during AI thinking
26
+ # - **Fade and slide animations** for conversation bubbles
27
+ # - **Status indicators** with glow effects
28
+
29
+ # ## 🎙️ **Voice Processing Features:**
30
+ # - **Real-time speech-to-text** conversion
31
+ # - **Voice activity detection** with automatic pause detection
32
+ # - **Natural text-to-speech** responses
33
+ # - **Conversation memory** for context awareness
34
+ # - **Interrupt capability** to cut off responses
35
+
36
+ # ## 💬 **Conversation Display:**
37
+ # - **Styled message bubbles** with different colors for user/assistant
38
+ # - **Real-time updates** showing conversation flow
39
+ # - **Scrollable history** with custom scrollbar
40
+ # - **Status indicators** showing connection state
41
+
42
+ # ## ⚙️ **Advanced Settings:**
43
+ # - **Customizable system prompt** for personality control
44
+ # - **Response speed adjustment** slider
45
+ # - **Clear conversation** functionality
46
+ # - **Start/stop controls** with visual feedback
47
+
48
+ # ## 🔧 **Technical Features:**
49
+ # - **WebRTC streaming** for low-latency audio
50
+ # - **Cloudflare TURN** for firewall traversal
51
+ # - **ZeroGPU optimization** for performance
52
+ # - **Modular architecture** for maintainability
53
+ # - **Error handling** with fallback responses
54
+
55
+ # The app provides a professional, engaging voice chat experience with smooth animations and real-time feedback throughout the conversation process.