eduard76 committed on
Commit
19c6da1
·
verified ·
1 Parent(s): b639322

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -389
app.py CHANGED
@@ -6,14 +6,6 @@ import tempfile
6
  import numpy as np
7
  from openai import OpenAI
8
 
9
- class RealtimeVoiceAgenimport gradio as gr
10
- import openai
11
- import os
12
- from pathlib import Path
13
- import tempfile
14
- import numpy as np
15
- from openai import OpenAI
16
-
17
  class RealtimeVoiceAgent:
18
  def __init__(self, api_key=None):
19
  """Initialize the voice agent with OpenAI"""
@@ -454,384 +446,3 @@ if __name__ == "__main__":
454
  share=False,
455
  show_error=True
456
  )
457
- :
458
- def __init__(self, api_key=None):
459
- """Initialize the voice agent with OpenAI"""
460
- self.api_key = api_key or os.getenv("OPENAI_API_KEY")
461
- if not self.api_key:
462
- raise ValueError("OpenAI API key not found. Set OPENAI_API_KEY environment variable.")
463
-
464
- self.client = OpenAI(api_key=self.api_key)
465
- self.conversation_history = []
466
- self.voice = "alloy" # Default voice
467
-
468
- def transcribe_audio(self, audio_path):
469
- """Convert speech to text using OpenAI Whisper API"""
470
- try:
471
- # Debug: Check if file exists
472
- if not os.path.exists(audio_path):
473
- raise Exception(f"Audio file not found at path: {audio_path}")
474
-
475
- # Debug: Check file size
476
- file_size = os.path.getsize(audio_path)
477
- if file_size == 0:
478
- raise Exception("Audio file is empty (0 bytes)")
479
-
480
- print(f"[DEBUG] Transcribing audio: {audio_path} ({file_size} bytes)")
481
-
482
- with open(audio_path, "rb") as audio_file:
483
- transcript = self.client.audio.transcriptions.create(
484
- model="whisper-1",
485
- file=audio_file,
486
- language="en"
487
- )
488
-
489
- print(f"[DEBUG] Transcription successful: {transcript.text[:50]}...")
490
- return transcript.text
491
-
492
- except FileNotFoundError as e:
493
- raise Exception(f"Audio file not found: {str(e)}")
494
- except Exception as e:
495
- raise Exception(f"Transcription failed: {type(e).__name__} - {str(e)}")
496
-
497
- def get_llm_response(self, user_message):
498
- """Get streaming response from OpenAI GPT"""
499
- try:
500
- # Add user message to history
501
- self.conversation_history.append({
502
- "role": "user",
503
- "content": user_message
504
- })
505
-
506
- # Get streaming response
507
- response = self.client.chat.completions.create(
508
- model="gpt-4o-mini", # Fast and cost-effective
509
- messages=[
510
- {"role": "system", "content": "You are a helpful, friendly voice assistant. Keep responses concise and natural for voice conversation (2-3 sentences max)."},
511
- *self.conversation_history
512
- ],
513
- max_tokens=150,
514
- temperature=0.7,
515
- stream=True
516
- )
517
-
518
- # Collect full response
519
- full_response = ""
520
- for chunk in response:
521
- if chunk.choices[0].delta.content:
522
- full_response += chunk.choices[0].delta.content
523
-
524
- # Add assistant response to history
525
- self.conversation_history.append({
526
- "role": "assistant",
527
- "content": full_response
528
- })
529
-
530
- return full_response
531
-
532
- except Exception as e:
533
- raise Exception(f"LLM response failed: {str(e)}")
534
-
535
- def synthesize_speech(self, text):
536
- """Convert text to speech using OpenAI TTS"""
537
- try:
538
- response = self.client.audio.speech.create(
539
- model="tts-1", # Fast model (tts-1-hd for higher quality)
540
- voice=self.voice, # Options: alloy, echo, fable, onyx, nova, shimmer
541
- input=text,
542
- speed=1.0
543
- )
544
-
545
- # Save to temporary file with proper handling for Gradio
546
- temp_dir = tempfile.gettempdir()
547
- output_path = os.path.join(temp_dir, f"tts_output_{os.getpid()}_{hash(text) % 10000}.mp3")
548
-
549
- with open(output_path, "wb") as f:
550
- f.write(response.content)
551
-
552
- return output_path
553
-
554
- except Exception as e:
555
- raise Exception(f"Speech synthesis failed: {str(e)}")
556
-
557
- def process_voice_input(self, audio_input, progress=gr.Progress()):
558
- """Full pipeline: Voice → Text → LLM → Voice"""
559
-
560
- if audio_input is None:
561
- return None, "⚠️ No audio detected. Please record your voice.", None, self._format_history()
562
-
563
- try:
564
- # Step 1: Speech to Text
565
- progress(0.2, desc="🎧 Transcribing your voice...")
566
- user_text = self.transcribe_audio(audio_input)
567
-
568
- if not user_text.strip():
569
- return None, "⚠️ Could not understand audio. Please speak clearly.", None, self._format_history()
570
-
571
- # Step 2: Get LLM Response
572
- progress(0.5, desc="🤔 Thinking...")
573
- assistant_text = self.get_llm_response(user_text)
574
-
575
- # Step 3: Text to Speech
576
- progress(0.8, desc="🔊 Generating voice response...")
577
- audio_output = self.synthesize_speech(assistant_text)
578
-
579
- # Format status
580
- status = f"**You:** {user_text}\n\n**Assistant:** {assistant_text}"
581
-
582
- # Format conversation history
583
- chat_history = self._format_history()
584
-
585
- progress(1.0, desc="✓ Done!")
586
-
587
- return audio_output, status, None, chat_history
588
-
589
- except Exception as e:
590
- error_msg = f"❌ Error: {str(e)}\n\nPlease check your API key and try again."
591
- return None, error_msg, None, self._format_history()
592
-
593
- def _format_history(self):
594
- """Format conversation history for chatbot display"""
595
- formatted = []
596
- for i in range(0, len(self.conversation_history), 2):
597
- if i + 1 < len(self.conversation_history):
598
- formatted.append((
599
- self.conversation_history[i]["content"],
600
- self.conversation_history[i + 1]["content"]
601
- ))
602
- return formatted
603
-
604
- def clear_conversation(self):
605
- """Clear conversation history"""
606
- self.conversation_history = []
607
- return None, "Conversation cleared!", None, []
608
-
609
- def change_voice(self, voice_name):
610
- """Change TTS voice"""
611
- self.voice = voice_name
612
- return f"✓ Voice changed to: **{voice_name}**"
613
-
614
-
615
- # Initialize agent (will use environment variable)
616
- agent = None
617
-
618
- def initialize_agent():
619
- """Initialize agent with API key check"""
620
- global agent
621
- api_key = os.getenv("OPENAI_API_KEY")
622
-
623
- if not api_key:
624
- return "❌ OpenAI API key not found!\n\nPlease set it in Hugging Face Space settings:\nSettings → Repository secrets → New secret\nName: OPENAI_API_KEY\nValue: your-api-key"
625
-
626
- try:
627
- agent = RealtimeVoiceAgent(api_key=api_key)
628
- return "✅ Voice Agent initialized successfully!\n\n🎤 You can now start talking!"
629
- except Exception as e:
630
- return f"❌ Initialization failed: {str(e)}"
631
-
632
- def process_audio_wrapper(audio, progress=gr.Progress()):
633
- """Wrapper to check if agent is initialized"""
634
- if agent is None:
635
- return None, "⚠️ Please initialize the agent first!", None, []
636
- return agent.process_voice_input(audio, progress)
637
-
638
- def clear_wrapper():
639
- """Wrapper for clear function"""
640
- if agent is None:
641
- return None, "⚠️ Please initialize the agent first!", None, []
642
- return agent.clear_conversation()
643
-
644
- def change_voice_wrapper(voice_name):
645
- """Wrapper for voice change function"""
646
- if agent is None:
647
- return "⚠️ Please initialize the agent first!"
648
- return agent.change_voice(voice_name)
649
-
650
-
651
- # Create Gradio Interface
652
- with gr.Blocks(
653
- title="🎙️ Real-Time Voice Agent",
654
- theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"),
655
- css="""
656
- .main-header {text-align: center; padding: 30px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px;}
657
- .status-box {background: #f8f9fa; padding: 15px; border-radius: 8px; border-left: 4px solid #667eea;}
658
- .warning-box {background: #fff3cd; padding: 15px; border-radius: 8px; border-left: 4px solid #ffc107;}
659
- """
660
- ) as demo:
661
-
662
- gr.Markdown("""
663
- <div class="main-header">
664
- <h1>🎙️ Real-Time Voice Agent</h1>
665
- <p>State-of-the-art voice conversation powered by OpenAI</p>
666
- <p><em>Whisper + GPT-4o-mini + TTS</em></p>
667
- </div>
668
- """)
669
-
670
- with gr.Row():
671
- with gr.Column(scale=1):
672
- gr.Markdown("""
673
- ### 🚀 Quick Start
674
-
675
- 1. **Initialize** the agent below
676
- 2. **Click** the microphone 🎤
677
- 3. **Speak** your question
678
- 4. **Listen** to the AI response
679
-
680
- ---
681
-
682
- ### ⚙️ Settings
683
- """)
684
-
685
- init_button = gr.Button(
686
- "🤖 Initialize Voice Agent",
687
- variant="primary",
688
- size="lg"
689
- )
690
-
691
- init_status = gr.Markdown(
692
- '<div class="warning-box">⚠️ Click "Initialize Voice Agent" to start</div>'
693
- )
694
-
695
- gr.Markdown("---")
696
-
697
- voice_selector = gr.Dropdown(
698
- choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
699
- value="alloy",
700
- label="🎵 AI Voice Style",
701
- info="Select the voice for AI responses"
702
- )
703
-
704
- voice_status = gr.Markdown("")
705
-
706
- gr.Markdown("""
707
- ---
708
-
709
- ### 💡 Tips
710
-
711
- - 🎯 Speak clearly and naturally
712
- - ⏱️ Keep messages under 20 seconds
713
- - 🔇 Minimize background noise
714
- - 🌐 Use Chrome for best compatibility
715
-
716
- ### 🎤 Voice Styles
717
-
718
- - **Alloy**: Neutral, balanced
719
- - **Echo**: Male, clear
720
- - **Fable**: British, expressive
721
- - **Onyx**: Deep, authoritative
722
- - **Nova**: Female, friendly
723
- - **Shimmer**: Warm, engaging
724
- """)
725
-
726
- with gr.Column(scale=2):
727
- gr.Markdown("## 🎤 Voice Conversation")
728
-
729
- audio_input = gr.Audio(
730
- sources=["microphone", "upload"],
731
- type="filepath",
732
- label="🎤 Click to Record Your Voice"
733
- )
734
-
735
- process_status = gr.Markdown(
736
- '<div class="status-box">**Status:** Ready to listen...</div>',
737
- elem_classes=["status-box"]
738
- )
739
-
740
- audio_output = gr.Audio(
741
- label="🔊 AI Voice Response",
742
- type="filepath",
743
- autoplay=True
744
- )
745
-
746
- with gr.Row():
747
- process_btn = gr.Button(
748
- "💬 Process Voice",
749
- variant="secondary",
750
- size="lg",
751
- scale=3
752
- )
753
- clear_btn = gr.Button(
754
- "🗑️ Clear History",
755
- variant="stop",
756
- scale=1
757
- )
758
-
759
- gr.Markdown("---")
760
- gr.Markdown("## 💭 Conversation History")
761
-
762
- conversation_display = gr.Chatbot(
763
- label="Your Conversation",
764
- height=400,
765
- bubble_full_width=False,
766
- avatar_images=(None, "🤖")
767
- )
768
-
769
- gr.Markdown("""
770
- ---
771
-
772
- ### 📊 Technical Stack
773
-
774
- - **Speech Recognition**: OpenAI Whisper (99%+ accuracy)
775
- - **Language Model**: GPT-4o-mini (fast, intelligent)
776
- - **Speech Synthesis**: OpenAI TTS (natural, expressive)
777
- - **Interface**: Gradio (real-time updates)
778
-
779
- ### 🔐 Privacy & Costs
780
-
781
- - Requires OpenAI API key (set in Space settings)
782
- - Approximate cost: $0.01-0.03 per conversation
783
- - Audio is processed through OpenAI's API
784
- - No data is stored permanently
785
-
786
- ### 🐛 Troubleshooting
787
-
788
- - **No audio?** Check browser microphone permissions
789
- - **API error?** Verify your OpenAI API key in Space settings
790
- - **Slow response?** Try shorter messages or upgrade to paid OpenAI plan
791
-
792
- ---
793
-
794
- <div style="text-align: center; color: #666;">
795
- Built with ❤️ using OpenAI APIs |
796
- <a href="https://github.com/openai/whisper">Whisper</a> |
797
- <a href="https://platform.openai.com/docs/guides/text-to-speech">TTS</a> |
798
- <a href="https://platform.openai.com/docs/guides/chat">GPT-4</a>
799
- </div>
800
- """)
801
-
802
- # Event handlers
803
- init_button.click(
804
- fn=initialize_agent,
805
- outputs=[init_status]
806
- )
807
-
808
- process_btn.click(
809
- fn=process_audio_wrapper,
810
- inputs=[audio_input],
811
- outputs=[audio_output, process_status, audio_input, conversation_display]
812
- )
813
-
814
- # Auto-process when recording stops
815
- audio_input.stop_recording(
816
- fn=process_audio_wrapper,
817
- inputs=[audio_input],
818
- outputs=[audio_output, process_status, audio_input, conversation_display]
819
- )
820
-
821
- clear_btn.click(
822
- fn=clear_wrapper,
823
- outputs=[audio_output, process_status, audio_input, conversation_display]
824
- )
825
-
826
- voice_selector.change(
827
- fn=change_voice_wrapper,
828
- inputs=[voice_selector],
829
- outputs=[voice_status]
830
- )
831
-
832
- if __name__ == "__main__":
833
- demo.launch(
834
- server_name="0.0.0.0",
835
- share=False,
836
- show_error=True
837
- )
 
6
  import numpy as np
7
  from openai import OpenAI
8
 
 
 
 
 
 
 
 
 
9
  class RealtimeVoiceAgent:
10
  def __init__(self, api_key=None):
11
  """Initialize the voice agent with OpenAI"""
 
446
  share=False,
447
  show_error=True
448
  )