File size: 11,929 Bytes
2ec0d39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
"""
Voice Agent Gradio Application
Web interface for the Voice Agent with microphone support
"""

import gradio as gr
import asyncio
import logging
import os
from .voice_agent import VoiceAgent


class VoiceApp:
    """Gradio web application for Voice Agent.

    Builds a Blocks UI (voice recording, text chat, voice settings and a
    status panel) on top of a ``VoiceAgent`` backend.  Public surface:
    ``agent``, ``conversation_history``, ``interface`` and :meth:`launch`.
    """

    def __init__(self):
        # Backend agent that answers every text/voice prompt.
        self.agent = VoiceAgent()
        # Kept for backward compatibility; the visible chat history lives in
        # the gr.Chatbot component state, not in this list.
        self.conversation_history = []

        # Set up logging
        logging.basicConfig(level=logging.INFO)

        # Create the interface
        self.interface = self._create_interface()

    def _create_interface(self):
        """Create and return the (not yet launched) Gradio Blocks interface.

        Returns:
            gr.Blocks: the assembled application with all events wired.
        """

        with gr.Blocks(
            title="🎀 Voice Agent - Secure AI Suite",
            theme=gr.themes.Soft(
                primary_hue="orange",
                secondary_hue="gray", 
                neutral_hue="slate"
            ),
            css="""
            .container { max-width: 1200px; margin: auto; }
            .chatbot { height: 500px; }
            .status-card { background: linear-gradient(90deg, #fa709a 0%, #fee140 100%); color: white; }
            .tool-card { border: 2px solid #e2e8f0; border-radius: 8px; padding: 12px; margin: 8px 0; }
            .audio-controls { text-align: center; padding: 20px; background: #f8fafc; border-radius: 8px; }
            """
        ) as app:
            
            # Header
            gr.HTML("""
            <div style='text-align: center; padding: 20px; background: linear-gradient(90deg, #fa709a 0%, #fee140 100%); color: white; border-radius: 10px;'>
                <h1 style='margin: 0; font-size: 2.5em;'>🎀 Voice Agent</h1>
                <p style='margin: 10px 0; font-size: 1.2em;'>Speech-to-AI & Text-to-Speech with Multi-modal Processing</p>
                <p style='margin: 0; opacity: 0.8;'>πŸ” Secure AI Agents Suite</p>
            </div>
            """)
            
            with gr.Row():
                # Left column - Voice interface
                with gr.Column(scale=2):
                    gr.HTML("<h3>πŸŽ™οΈ Voice Interaction</h3>")
                    
                    # Audio input/output section
                    with gr.Column():
                        gr.HTML("<div class='audio-controls'>")
                        gr.HTML("<h4>πŸŽ™οΈ Record Your Voice</h4>")
                        audio_input = gr.Audio(
                            label="Click to record or upload audio file",
                            type="filepath",
                            format="mp3",
                            elem_classes=["audio-input"]
                        )
                        
                        gr.HTML("<h4>πŸ—£οΈ AI Response (Audio)</h4>")
                        # type="filepath": process_audio() echoes the input
                        # file path back; the previous type="numpy" did not
                        # match the value the handler returns.
                        audio_output = gr.Audio(
                            label="AI response will appear here",
                            type="filepath",
                            elem_classes=["audio-output"]
                        )
                        gr.HTML("</div>")
                    
                    gr.HTML("<h3>πŸ’¬ Text Chat with Voice Features</h3>")
                    
                    # NOTE(review): emoji avatars may not render in every
                    # Gradio version (avatar_images usually expects
                    # paths/URLs) — confirm against the pinned version.
                    chatbot = gr.Chatbot(
                        label="Voice Assistant Chat",
                        height=300,
                        elem_classes=["chatbot"],
                        avatar_images=(None, "🎀")
                    )
                    
                    with gr.Row():
                        msg_input = gr.Textbox(
                            placeholder="Type or use voice input. Try: 'Transcribe this audio' or 'Say hello in a female voice'...",
                            lines=2,
                            max_lines=4,
                            label="Your Message"
                        )
                        with gr.Column(scale=0):
                            send_btn = gr.Button("Send", variant="primary")
                            clear_btn = gr.Button("Clear", variant="secondary")
                
                # Right column - Voice Tools and Settings
                with gr.Column(scale=1):
                    gr.HTML("<h3>πŸ› οΈ Voice Services</h3>")
                    
                    # Static capability cards (display only).
                    gr.HTML("""
                    <div class="tool-card">
                        <h4>πŸŽ™οΈ Speech-to-Text</h4>
                        <p>β€’ Whisper transcription<br>β€’ Multi-language support<br>β€’ High accuracy</p>
                    </div>
                    <div class="tool-card">
                        <h4>πŸ—£οΈ Text-to-Speech</h4>
                        <p>β€’ ElevenLabs synthesis<br>β€’ Natural voices<br>β€’ Emotional expression</p>
                    </div>
                    <div class="tool-card">
                        <h4>πŸ’¬ Voice Conversation</h4>
                        <p>β€’ Full-duplex chat<br>β€’ Real-time processing<br>β€’ Context awareness</p>
                    </div>
                    <div class="tool-card">
                        <h4>🌍 Multilingual</h4>
                        <p>β€’ 5+ languages<br>β€’ Auto-detection<br>β€’ Cultural adaptation</p>
                    </div>
                    """)
                    
                    gr.HTML("<h3>πŸŽ›οΈ Voice Settings</h3>")
                    # NOTE(review): these controls are meant to feed
                    # text_to_speech() but are not wired to any event yet.
                    with gr.Row():
                        voice_select = gr.Dropdown(
                            choices=["Adam (Male)", "Rachel (Female)", "Cloyd (Deep)", "Custom"],
                            value="Adam (Male)",
                            label="Voice Selection"
                        )
                        speed_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speech Speed")
                    
                    gr.HTML("<h3>πŸ“Š System Status</h3>")
                    status_display = gr.HTML()
            
            # Event handlers
            def user(user_message, history):
                """Append the user's message to the chat and clear the textbox."""
                if not user_message.strip():
                    return history, ""
                
                # Reserve a reply slot (None); bot_response() fills it in.
                history.append((user_message, None))
                return history, ""
            
            async def bot_response(history):
                """Generate the agent's reply for the pending chat message.

                Reads the message from the chat history rather than from the
                textbox: by the time this runs in the .then() chain, user()
                has already cleared the textbox, so reading it would always
                yield an empty string and the bot would never answer.
                """
                if not history or history[-1][1] is not None:
                    return history
                
                user_message = history[-1][0]
                
                # Get response from agent
                response = await self.agent.handle_user_input(user_message)
                
                # Fill in the reply slot reserved by user().
                history[-1] = (user_message, response)
                return history
            
            async def process_audio(audio_file, history):
                """Process uploaded or recorded audio and log the result in chat.

                Returns the (unchanged) audio file path for the output player
                plus the updated chat history. The Chatbot component expects
                (user, bot) pairs, so the response is appended as a pair
                instead of being returned as a bare string.
                """
                if not audio_file:
                    return None, history
                
                try:
                    # Process audio with voice agent
                    response = await self.agent.handle_user_input("process this audio file")
                except Exception as e:
                    # Surface the error in the chat instead of crashing the UI.
                    response = f"Error processing audio: {str(e)}"
                
                history = history + [("[audio message]", response)]
                return audio_file, history
            
            async def text_to_speech(text, voice_style, speed):
                """Convert text to speech via the agent.

                NOTE(review): not wired to any UI event yet (kept for the
                planned "speak" feature). The returned path is a mock — no
                audio file is actually written to disk.
                """
                if not text.strip():
                    return None, "No text provided"
                
                try:
                    # Process with voice synthesis
                    voice_prompt = f"speak: {text} with {voice_style} voice at {speed}x speed"
                    response = await self.agent.handle_user_input(voice_prompt)
                    
                    # Generate mock audio file path
                    audio_path = f"temp_audio_{hash(text)}.mp3"
                    
                    return audio_path, response
                except Exception as e:
                    return None, f"Error generating speech: {str(e)}"
            
            def clear_conversation():
                """Clear conversation history."""
                return []
            
            def update_status():
                """Render the system-status HTML card from live agent state."""
                status = self.agent.get_status()
                voice_settings = self.agent.config.get("voice_settings", {})
                return f"""
                <div class="status-card" style="padding: 15px; border-radius: 8px;">
                    <h4>βœ… Voice System Status</h4>
                    <p><strong>Agent:</strong> {status['name']}</p>
                    <p><strong>Status:</strong> {status['status']}</p>
                    <p><strong>Whisper:</strong> {voice_settings.get('whisper_model', 'whisper-1')}</p>
                    <p><strong>ElevenLabs:</strong> Active</p>
                    <p><strong>Languages:</strong> 5+ supported</p>
                    <p><strong>Security:</strong> {'πŸ›‘οΈ Enabled' if status['security_enabled'] else '❌ Disabled'}</p>
                </div>
                """
            
            # Connect events: append the message first, then let the agent
            # answer. bot_response takes only the chatbot state (see above).
            send_btn.click(
                user,
                inputs=[msg_input, chatbot],
                outputs=[chatbot, msg_input]
            ).then(
                bot_response,
                inputs=[chatbot],
                outputs=[chatbot]
            )
            
            msg_input.submit(
                user,
                inputs=[msg_input, chatbot],
                outputs=[chatbot, msg_input]
            ).then(
                bot_response,
                inputs=[chatbot],
                outputs=[chatbot]
            )
            
            # Audio processing (also needs the chat state to append to it).
            audio_input.change(
                process_audio,
                inputs=[audio_input, chatbot],
                outputs=[audio_output, chatbot]
            )
            
            clear_btn.click(clear_conversation, outputs=chatbot)
            
            # Initial status update
            app.load(update_status, outputs=status_display)
        
        return app
    
    def launch(self, **kwargs):
        """Launch the Gradio application.

        Binds to 0.0.0.0 (all interfaces) on port 7863; callers may override
        any launch option via ``**kwargs``.
        """
        self.interface.launch(
            server_name="0.0.0.0",
            server_port=7863,
            share=False,
            show_error=True,
            quiet=False,
            **kwargs
        )


# Example usage and quick commands
# Sample prompts printed in the startup banner by main() to show users what
# kinds of requests the agent understands.
EXAMPLE_QUERIES = [
    "Transcribe this audio file",
    "Say 'Hello, welcome to our voice AI' in a female voice",
    "Start a voice conversation",
    "Analyze the sentiment of this audio",
    "Search for meeting recordings about project updates",
    "Enable multilingual voice mode"
]


def main():
    """Print the startup banner, build the VoiceApp, and start the server."""
    boot_messages = (
        "🎀 Starting Voice Agent...",
        "πŸŽ™οΈ Initializing Whisper (Speech-to-Text)...",
        "πŸ—£οΈ Loading ElevenLabs (Text-to-Speech)...",
        "🧠 Connecting AI models (GPT-4o, Gemini)...",
        "🌍 Setting up multilingual support...",
    )
    for line in boot_messages:
        print(line)

    voice_app = VoiceApp()

    divider = "=" * 60
    print("\n" + divider)
    print("🎀 VOICE AGENT - SPEECH PROCESSING SUITE")
    print(divider)
    print("\nπŸ’‘ Example voice requests you can try:")
    for idx, example in enumerate(EXAMPLE_QUERIES, 1):
        print(f"   {idx}. {example}")
    print("\nπŸŽ™οΈ Features:")
    for feature in (
        "   β€’ Record your voice or upload audio files",
        "   β€’ Convert text to natural-sounding speech",
        "   β€’ Full voice conversations with AI",
        "   β€’ Multi-language support (English, Spanish, Nepali, etc.)",
    ):
        print(feature)
    print("\n🌐 Starting Gradio server...")
    print("πŸ”— Open your browser to: http://localhost:7863")
    print("\n" + divider)

    voice_app.launch()


# Run the app only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()