Spaces:
Running
on
Zero
Running
on
Zero
| """ | |
| Voice Agent Gradio Application | |
| Web interface for the Voice Agent with microphone support | |
| """ | |
| import gradio as gr | |
| import asyncio | |
| import logging | |
| import os | |
| from .voice_agent import VoiceAgent | |
class VoiceApp:
    """Gradio web application for the Voice Agent.

    Builds a Blocks UI with a voice-recording panel, a text chatbot,
    TTS settings and a live status card, all backed by a single
    ``VoiceAgent`` instance.

    NOTE(review): emoji in the UI strings appear mojibake-garbled in this
    file; they are kept byte-for-byte rather than guessed at.
    """

    def __init__(self):
        # Backend agent that performs STT/TTS and chat handling.
        self.agent = VoiceAgent()
        self.conversation_history = []

        # Set up logging for the whole app process.
        logging.basicConfig(level=logging.INFO)

        # Build the Gradio UI once at construction time.
        self.interface = self._create_interface()

    def _create_interface(self):
        """Create and return the Gradio Blocks interface."""
        with gr.Blocks(
            title="π€ Voice Agent - Secure AI Suite",
            theme=gr.themes.Soft(
                primary_hue="orange",
                secondary_hue="gray",
                neutral_hue="slate",
            ),
            css="""
            .container { max-width: 1200px; margin: auto; }
            .chatbot { height: 500px; }
            .status-card { background: linear-gradient(90deg, #fa709a 0%, #fee140 100%); color: white; }
            .tool-card { border: 2px solid #e2e8f0; border-radius: 8px; padding: 12px; margin: 8px 0; }
            .audio-controls { text-align: center; padding: 20px; background: #f8fafc; border-radius: 8px; }
            """,
        ) as app:
            # Header banner.
            gr.HTML("""
            <div style='text-align: center; padding: 20px; background: linear-gradient(90deg, #fa709a 0%, #fee140 100%); color: white; border-radius: 10px;'>
                <h1 style='margin: 0; font-size: 2.5em;'>π€ Voice Agent</h1>
                <p style='margin: 10px 0; font-size: 1.2em;'>Speech-to-AI & Text-to-Speech with Multi-modal Processing</p>
                <p style='margin: 0; opacity: 0.8;'>π Secure AI Agents Suite</p>
            </div>
            """)

            with gr.Row():
                # Left column - Voice interface
                with gr.Column(scale=2):
                    gr.HTML("<h3>ποΈ Voice Interaction</h3>")

                    # Audio input/output section
                    with gr.Column():
                        gr.HTML("<div class='audio-controls'>")
                        gr.HTML("<h4>ποΈ Record Your Voice</h4>")
                        audio_input = gr.Audio(
                            label="Click to record or upload audio file",
                            type="filepath",
                            format="mp3",
                            elem_classes=["audio-input"],
                        )
                        gr.HTML("<h4>π£οΈ AI Response (Audio)</h4>")
                        # FIX: handlers return a filepath, so the output
                        # component must be type="filepath", not "numpy".
                        audio_output = gr.Audio(
                            label="AI response will appear here",
                            type="filepath",
                            elem_classes=["audio-output"],
                        )
                        gr.HTML("</div>")

                    gr.HTML("<h3>π¬ Text Chat with Voice Features</h3>")
                    chatbot = gr.Chatbot(
                        label="Voice Assistant Chat",
                        height=300,
                        elem_classes=["chatbot"],
                        avatar_images=(None, "π€"),
                    )

                    with gr.Row():
                        msg_input = gr.Textbox(
                            placeholder="Type or use voice input. Try: 'Transcribe this audio' or 'Say hello in a female voice'...",
                            lines=2,
                            max_lines=4,
                            label="Your Message",
                        )
                        with gr.Column(scale=0):
                            send_btn = gr.Button("Send", variant="primary")
                            clear_btn = gr.Button("Clear", variant="secondary")

                # Right column - Voice Tools and Settings
                with gr.Column(scale=1):
                    gr.HTML("<h3>π οΈ Voice Services</h3>")
                    tools_info = gr.HTML("""
                    <div class="tool-card">
                        <h4>ποΈ Speech-to-Text</h4>
                        <p>β’ Whisper transcription<br>β’ Multi-language support<br>β’ High accuracy</p>
                    </div>
                    <div class="tool-card">
                        <h4>π£οΈ Text-to-Speech</h4>
                        <p>β’ ElevenLabs synthesis<br>β’ Natural voices<br>β’ Emotional expression</p>
                    </div>
                    <div class="tool-card">
                        <h4>π¬ Voice Conversation</h4>
                        <p>β’ Full-duplex chat<br>β’ Real-time processing<br>β’ Context awareness</p>
                    </div>
                    <div class="tool-card">
                        <h4>π Multilingual</h4>
                        <p>β’ 5+ languages<br>β’ Auto-detection<br>β’ Cultural adaptation</p>
                    </div>
                    """)

                    gr.HTML("<h3>ποΈ Voice Settings</h3>")
                    with gr.Row():
                        voice_select = gr.Dropdown(
                            choices=["Adam (Male)", "Rachel (Female)", "Cloyd (Deep)", "Custom"],
                            value="Adam (Male)",
                            label="Voice Selection",
                        )
                        speed_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speech Speed")

                    gr.HTML("<h3>π System Status</h3>")
                    status_display = gr.HTML()

            # ---------------- Event handlers ----------------

            def user(user_message, history):
                """Append the user's message to the chat and clear the textbox."""
                if not user_message.strip():
                    return history, ""
                # Pending entry: bot reply is filled in by bot_response().
                history.append((user_message, None))
                return history, ""

            async def bot_response(history):
                """Fill in the assistant's reply for the newest user turn.

                BUG FIX: the original re-read the textbox, but the ``user``
                handler has already cleared it to "" by the time this runs,
                so the agent was always called with an empty string and the
                bot never replied. The message is now recovered from the
                pending (user, None) history entry instead.
                """
                if not history or history[-1][1] is not None:
                    return history
                user_message = history[-1][0]
                response = await self.agent.handle_user_input(user_message)
                history[-1] = (user_message, response)
                return history

            async def process_audio(audio_file, history):
                """Process uploaded or recorded audio and log the exchange.

                BUG FIX: the original returned a bare string for the chatbot
                output, but gr.Chatbot expects a list of (user, bot) pairs;
                the exchange is now appended to the history instead.
                """
                if not audio_file:
                    return None, history
                try:
                    # Process audio with the voice agent.
                    response = await self.agent.handle_user_input("process this audio file")
                except Exception as e:
                    response = f"Error processing audio: {str(e)}"
                history = history + [("(audio message)", response)]
                return audio_file, history

            async def text_to_speech(text, voice_style, speed):
                """Convert text to speech via the agent.

                NOTE(review): currently not wired to any UI event, and the
                returned path is a mock name — no file is actually written
                here; confirm against the agent's real TTS output path.
                """
                if not text.strip():
                    return None, "No text provided"
                try:
                    voice_prompt = f"speak: {text} with {voice_style} voice at {speed}x speed"
                    response = await self.agent.handle_user_input(voice_prompt)
                    audio_path = f"temp_audio_{hash(text)}.mp3"
                    return audio_path, response
                except Exception as e:
                    return None, f"Error generating speech: {str(e)}"

            def clear_conversation():
                """Clear conversation history."""
                return []

            def update_status():
                """Render the status card from the agent's reported state."""
                status = self.agent.get_status()
                voice_settings = self.agent.config.get("voice_settings", {})
                return f"""
                <div class="status-card" style="padding: 15px; border-radius: 8px;">
                    <h4>β Voice System Status</h4>
                    <p><strong>Agent:</strong> {status['name']}</p>
                    <p><strong>Status:</strong> {status['status']}</p>
                    <p><strong>Whisper:</strong> {voice_settings.get('whisper_model', 'whisper-1')}</p>
                    <p><strong>ElevenLabs:</strong> Active</p>
                    <p><strong>Languages:</strong> 5+ supported</p>
                    <p><strong>Security:</strong> {'π‘οΈ Enabled' if status['security_enabled'] else 'β Disabled'}</p>
                </div>
                """

            # ---------------- Wiring ----------------
            # Both send paths: record the user turn, then generate the reply
            # from the history (not from the now-cleared textbox).
            send_btn.click(
                user,
                inputs=[msg_input, chatbot],
                outputs=[chatbot, msg_input],
            ).then(
                bot_response,
                inputs=[chatbot],
                outputs=[chatbot],
            )
            msg_input.submit(
                user,
                inputs=[msg_input, chatbot],
                outputs=[chatbot, msg_input],
            ).then(
                bot_response,
                inputs=[chatbot],
                outputs=[chatbot],
            )

            # Audio processing (history passed so the exchange is logged).
            audio_input.change(
                process_audio,
                inputs=[audio_input, chatbot],
                outputs=[audio_output, chatbot],
            )

            # (Removed dead `generate_speech` wrapper — it was never wired
            # to any component; `text_to_speech` remains for future use.)

            clear_btn.click(clear_conversation, outputs=chatbot)

            # Initial status update on page load.
            app.load(update_status, outputs=status_display)

        return app

    def launch(self, **kwargs):
        """Launch the Gradio application.

        BUG FIX: the original passed hard-coded keywords alongside
        ``**kwargs``, so any caller supplying e.g. ``server_port`` crashed
        with a duplicate-keyword TypeError. Defaults are now overridable.
        """
        options = {
            "server_name": "0.0.0.0",
            "server_port": 7863,
            "share": False,
            "show_error": True,
            "quiet": False,
        }
        options.update(kwargs)
        self.interface.launch(**options)
# Canned example prompts surfaced to the user at startup (see main()).
EXAMPLE_QUERIES = [
    "Transcribe this audio file",
    "Say 'Hello, welcome to our voice AI' in a female voice",
    "Start a voice conversation",
    "Analyze the sentiment of this audio",
    "Search for meeting recordings about project updates",
    "Enable multilingual voice mode",
]
def main():
    """Entry point: print a startup banner, then launch the Voice Agent app."""
    # Pre-initialization progress messages.
    for line in (
        "π€ Starting Voice Agent...",
        "ποΈ Initializing Whisper (Speech-to-Text)...",
        "π£οΈ Loading ElevenLabs (Text-to-Speech)...",
        "π§ Connecting AI models (GPT-4o, Gemini)...",
        "π Setting up multilingual support...",
    ):
        print(line)

    app = VoiceApp()

    rule = "=" * 60
    print("\n" + rule)
    print("π€ VOICE AGENT - SPEECH PROCESSING SUITE")
    print(rule)

    # Show the canned example prompts, numbered from 1.
    print("\nπ‘ Example voice requests you can try:")
    for i, query in enumerate(EXAMPLE_QUERIES, 1):
        print(f" {i}. {query}")

    print("\nποΈ Features:")
    for line in (
        " β’ Record your voice or upload audio files",
        " β’ Convert text to natural-sounding speech",
        " β’ Full voice conversations with AI",
        " β’ Multi-language support (English, Spanish, Nepali, etc.)",
    ):
        print(line)

    print("\nπ Starting Gradio server...")
    print("π Open your browser to: http://localhost:7863")
    print("\n" + rule)

    app.launch()


if __name__ == "__main__":
    main()