File size: 11,929 Bytes
2ec0d39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
"""
Voice Agent Gradio Application
Web interface for the Voice Agent with microphone support
"""

import gradio as gr
import asyncio
import logging
import os
from .voice_agent import VoiceAgent


class VoiceApp:
    """Gradio web application for Voice Agent.

    Builds a Blocks UI (voice recording, text chat, voice settings and a
    status panel) on top of a ``VoiceAgent`` backend.  Public surface:
    ``agent``, ``conversation_history``, ``interface`` and :meth:`launch`.
    """

    def __init__(self):
        # Backend agent that answers every text/voice prompt.
        self.agent = VoiceAgent()
        # Kept for backward compatibility; the visible chat history lives in
        # the gr.Chatbot component state, not in this list.
        self.conversation_history = []

        # Set up logging
        logging.basicConfig(level=logging.INFO)

        # Create the interface
        self.interface = self._create_interface()

    def _create_interface(self):
        """Create and return the (not yet launched) Gradio Blocks interface.

        Returns:
            gr.Blocks: the assembled application with all events wired.
        """

        with gr.Blocks(
            title="🎀 Voice Agent - Secure AI Suite",
            theme=gr.themes.Soft(
                primary_hue="orange",
                secondary_hue="gray", 
                neutral_hue="slate"
            ),
            css="""
            .container { max-width: 1200px; margin: auto; }
            .chatbot { height: 500px; }
            .status-card { background: linear-gradient(90deg, #fa709a 0%, #fee140 100%); color: white; }
            .tool-card { border: 2px solid #e2e8f0; border-radius: 8px; padding: 12px; margin: 8px 0; }
            .audio-controls { text-align: center; padding: 20px; background: #f8fafc; border-radius: 8px; }
            """
        ) as app:
            
            # Header
            gr.HTML("""
            <div style='text-align: center; padding: 20px; background: linear-gradient(90deg, #fa709a 0%, #fee140 100%); color: white; border-radius: 10px;'>
                <h1 style='margin: 0; font-size: 2.5em;'>🎀 Voice Agent</h1>
                <p style='margin: 10px 0; font-size: 1.2em;'>Speech-to-AI & Text-to-Speech with Multi-modal Processing</p>
                <p style='margin: 0; opacity: 0.8;'>πŸ” Secure AI Agents Suite</p>
            </div>
            """)
            
            with gr.Row():
                # Left column - Voice interface
                with gr.Column(scale=2):
                    gr.HTML("<h3>πŸŽ™οΈ Voice Interaction</h3>")
                    
                    # Audio input/output section
                    with gr.Column():
                        gr.HTML("<div class='audio-controls'>")
                        gr.HTML("<h4>πŸŽ™οΈ Record Your Voice</h4>")
                        audio_input = gr.Audio(
                            label="Click to record or upload audio file",
                            type="filepath",
                            format="mp3",
                            elem_classes=["audio-input"]
                        )
                        
                        gr.HTML("<h4>πŸ—£οΈ AI Response (Audio)</h4>")
                        # type="filepath": process_audio() echoes the input
                        # file path back; the previous type="numpy" did not
                        # match the value the handler returns.
                        audio_output = gr.Audio(
                            label="AI response will appear here",
                            type="filepath",
                            elem_classes=["audio-output"]
                        )
                        gr.HTML("</div>")
                    
                    gr.HTML("<h3>πŸ’¬ Text Chat with Voice Features</h3>")
                    
                    # NOTE(review): emoji avatars may not render in every
                    # Gradio version (avatar_images usually expects
                    # paths/URLs) — confirm against the pinned version.
                    chatbot = gr.Chatbot(
                        label="Voice Assistant Chat",
                        height=300,
                        elem_classes=["chatbot"],
                        avatar_images=(None, "🎀")
                    )
                    
                    with gr.Row():
                        msg_input = gr.Textbox(
                            placeholder="Type or use voice input. Try: 'Transcribe this audio' or 'Say hello in a female voice'...",
                            lines=2,
                            max_lines=4,
                            label="Your Message"
                        )
                        with gr.Column(scale=0):
                            send_btn = gr.Button("Send", variant="primary")
                            clear_btn = gr.Button("Clear", variant="secondary")
                
                # Right column - Voice Tools and Settings
                with gr.Column(scale=1):
                    gr.HTML("<h3>πŸ› οΈ Voice Services</h3>")
                    
                    # Static capability cards (display only).
                    gr.HTML("""
                    <div class="tool-card">
                        <h4>πŸŽ™οΈ Speech-to-Text</h4>
                        <p>β€’ Whisper transcription<br>β€’ Multi-language support<br>β€’ High accuracy</p>
                    </div>
                    <div class="tool-card">
                        <h4>πŸ—£οΈ Text-to-Speech</h4>
                        <p>β€’ ElevenLabs synthesis<br>β€’ Natural voices<br>β€’ Emotional expression</p>
                    </div>
                    <div class="tool-card">
                        <h4>πŸ’¬ Voice Conversation</h4>
                        <p>β€’ Full-duplex chat<br>β€’ Real-time processing<br>β€’ Context awareness</p>
                    </div>
                    <div class="tool-card">
                        <h4>🌍 Multilingual</h4>
                        <p>β€’ 5+ languages<br>β€’ Auto-detection<br>β€’ Cultural adaptation</p>
                    </div>
                    """)
                    
                    gr.HTML("<h3>πŸŽ›οΈ Voice Settings</h3>")
                    # NOTE(review): these controls are meant to feed
                    # text_to_speech() but are not wired to any event yet.
                    with gr.Row():
                        voice_select = gr.Dropdown(
                            choices=["Adam (Male)", "Rachel (Female)", "Cloyd (Deep)", "Custom"],
                            value="Adam (Male)",
                            label="Voice Selection"
                        )
                        speed_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speech Speed")
                    
                    gr.HTML("<h3>πŸ“Š System Status</h3>")
                    status_display = gr.HTML()
            
            # Event handlers
            def user(user_message, history):
                """Append the user's message to the chat and clear the textbox."""
                if not user_message.strip():
                    return history, ""
                
                # Reserve a reply slot (None); bot_response() fills it in.
                history.append((user_message, None))
                return history, ""
            
            async def bot_response(history):
                """Generate the agent's reply for the pending chat message.

                Reads the message from the chat history rather than from the
                textbox: by the time this runs in the .then() chain, user()
                has already cleared the textbox, so reading it would always
                yield an empty string and the bot would never answer.
                """
                if not history or history[-1][1] is not None:
                    return history
                
                user_message = history[-1][0]
                
                # Get response from agent
                response = await self.agent.handle_user_input(user_message)
                
                # Fill in the reply slot reserved by user().
                history[-1] = (user_message, response)
                return history
            
            async def process_audio(audio_file, history):
                """Process uploaded or recorded audio and log the result in chat.

                Returns the (unchanged) audio file path for the output player
                plus the updated chat history. The Chatbot component expects
                (user, bot) pairs, so the response is appended as a pair
                instead of being returned as a bare string.
                """
                if not audio_file:
                    return None, history
                
                try:
                    # Process audio with voice agent
                    response = await self.agent.handle_user_input("process this audio file")
                except Exception as e:
                    # Surface the error in the chat instead of crashing the UI.
                    response = f"Error processing audio: {str(e)}"
                
                history = history + [("[audio message]", response)]
                return audio_file, history
            
            async def text_to_speech(text, voice_style, speed):
                """Convert text to speech via the agent.

                NOTE(review): not wired to any UI event yet (kept for the
                planned "speak" feature). The returned path is a mock — no
                audio file is actually written to disk.
                """
                if not text.strip():
                    return None, "No text provided"
                
                try:
                    # Process with voice synthesis
                    voice_prompt = f"speak: {text} with {voice_style} voice at {speed}x speed"
                    response = await self.agent.handle_user_input(voice_prompt)
                    
                    # Generate mock audio file path
                    audio_path = f"temp_audio_{hash(text)}.mp3"
                    
                    return audio_path, response
                except Exception as e:
                    return None, f"Error generating speech: {str(e)}"
            
            def clear_conversation():
                """Clear conversation history."""
                return []
            
            def update_status():
                """Render the system-status HTML card from live agent state."""
                status = self.agent.get_status()
                voice_settings = self.agent.config.get("voice_settings", {})
                return f"""
                <div class="status-card" style="padding: 15px; border-radius: 8px;">
                    <h4>βœ… Voice System Status</h4>
                    <p><strong>Agent:</strong> {status['name']}</p>
                    <p><strong>Status:</strong> {status['status']}</p>
                    <p><strong>Whisper:</strong> {voice_settings.get('whisper_model', 'whisper-1')}</p>
                    <p><strong>ElevenLabs:</strong> Active</p>
                    <p><strong>Languages:</strong> 5+ supported</p>
                    <p><strong>Security:</strong> {'πŸ›‘οΈ Enabled' if status['security_enabled'] else '❌ Disabled'}</p>
                </div>
                """
            
            # Connect events: append the message first, then let the agent
            # answer. bot_response takes only the chatbot state (see above).
            send_btn.click(
                user,
                inputs=[msg_input, chatbot],
                outputs=[chatbot, msg_input]
            ).then(
                bot_response,
                inputs=[chatbot],
                outputs=[chatbot]
            )
            
            msg_input.submit(
                user,
                inputs=[msg_input, chatbot],
                outputs=[chatbot, msg_input]
            ).then(
                bot_response,
                inputs=[chatbot],
                outputs=[chatbot]
            )
            
            # Audio processing (also needs the chat state to append to it).
            audio_input.change(
                process_audio,
                inputs=[audio_input, chatbot],
                outputs=[audio_output, chatbot]
            )
            
            clear_btn.click(clear_conversation, outputs=chatbot)
            
            # Initial status update
            app.load(update_status, outputs=status_display)
        
        return app
    
    def launch(self, **kwargs):
        """Launch the Gradio application.

        Binds to 0.0.0.0 (all interfaces) on port 7863; callers may override
        any launch option via ``**kwargs``.
        """
        self.interface.launch(
            server_name="0.0.0.0",
            server_port=7863,
            share=False,
            show_error=True,
            quiet=False,
            **kwargs
        )


# Example usage and quick commands
# Sample prompts printed in the startup banner by main() to show users what
# kinds of requests the agent understands.
EXAMPLE_QUERIES = [
    "Transcribe this audio file",
    "Say 'Hello, welcome to our voice AI' in a female voice",
    "Start a voice conversation",
    "Analyze the sentiment of this audio",
    "Search for meeting recordings about project updates",
    "Enable multilingual voice mode"
]


def main():
    """Print the startup banner, build the VoiceApp, and start the server."""
    boot_messages = (
        "🎀 Starting Voice Agent...",
        "πŸŽ™οΈ Initializing Whisper (Speech-to-Text)...",
        "πŸ—£οΈ Loading ElevenLabs (Text-to-Speech)...",
        "🧠 Connecting AI models (GPT-4o, Gemini)...",
        "🌍 Setting up multilingual support...",
    )
    for line in boot_messages:
        print(line)

    voice_app = VoiceApp()

    divider = "=" * 60
    print("\n" + divider)
    print("🎀 VOICE AGENT - SPEECH PROCESSING SUITE")
    print(divider)
    print("\nπŸ’‘ Example voice requests you can try:")
    for idx, example in enumerate(EXAMPLE_QUERIES, 1):
        print(f"   {idx}. {example}")
    print("\nπŸŽ™οΈ Features:")
    for feature in (
        "   β€’ Record your voice or upload audio files",
        "   β€’ Convert text to natural-sounding speech",
        "   β€’ Full voice conversations with AI",
        "   β€’ Multi-language support (English, Spanish, Nepali, etc.)",
    ):
        print(feature)
    print("\n🌐 Starting Gradio server...")
    print("πŸ”— Open your browser to: http://localhost:7863")
    print("\n" + divider)

    voice_app.launch()


# Run the app only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()