eduard76 committed on
Commit
19c6da1
·
verified ·
1 Parent(s): b639322

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -389
app.py CHANGED
@@ -6,14 +6,6 @@ import tempfile
6
  import numpy as np
7
  from openai import OpenAI
8
 
9
- class RealtimeVoiceAgenimport gradio as gr
10
- import openai
11
- import os
12
- from pathlib import Path
13
- import tempfile
14
- import numpy as np
15
- from openai import OpenAI
16
-
17
  class RealtimeVoiceAgent:
18
  def __init__(self, api_key=None):
19
  """Initialize the voice agent with OpenAI"""
@@ -454,384 +446,3 @@ if __name__ == "__main__":
454
  share=False,
455
  show_error=True
456
  )
457
- :
458
- def __init__(self, api_key=None):
459
- """Initialize the voice agent with OpenAI"""
460
- self.api_key = api_key or os.getenv("OPENAI_API_KEY")
461
- if not self.api_key:
462
- raise ValueError("OpenAI API key not found. Set OPENAI_API_KEY environment variable.")
463
-
464
- self.client = OpenAI(api_key=self.api_key)
465
- self.conversation_history = []
466
- self.voice = "alloy" # Default voice
467
-
468
- def transcribe_audio(self, audio_path):
469
- """Convert speech to text using OpenAI Whisper API"""
470
- try:
471
- # Debug: Check if file exists
472
- if not os.path.exists(audio_path):
473
- raise Exception(f"Audio file not found at path: {audio_path}")
474
-
475
- # Debug: Check file size
476
- file_size = os.path.getsize(audio_path)
477
- if file_size == 0:
478
- raise Exception("Audio file is empty (0 bytes)")
479
-
480
- print(f"[DEBUG] Transcribing audio: {audio_path} ({file_size} bytes)")
481
-
482
- with open(audio_path, "rb") as audio_file:
483
- transcript = self.client.audio.transcriptions.create(
484
- model="whisper-1",
485
- file=audio_file,
486
- language="en"
487
- )
488
-
489
- print(f"[DEBUG] Transcription successful: {transcript.text[:50]}...")
490
- return transcript.text
491
-
492
- except FileNotFoundError as e:
493
- raise Exception(f"Audio file not found: {str(e)}")
494
- except Exception as e:
495
- raise Exception(f"Transcription failed: {type(e).__name__} - {str(e)}")
496
-
497
- def get_llm_response(self, user_message):
498
- """Get streaming response from OpenAI GPT"""
499
- try:
500
- # Add user message to history
501
- self.conversation_history.append({
502
- "role": "user",
503
- "content": user_message
504
- })
505
-
506
- # Get streaming response
507
- response = self.client.chat.completions.create(
508
- model="gpt-4o-mini", # Fast and cost-effective
509
- messages=[
510
- {"role": "system", "content": "You are a helpful, friendly voice assistant. Keep responses concise and natural for voice conversation (2-3 sentences max)."},
511
- *self.conversation_history
512
- ],
513
- max_tokens=150,
514
- temperature=0.7,
515
- stream=True
516
- )
517
-
518
- # Collect full response
519
- full_response = ""
520
- for chunk in response:
521
- if chunk.choices[0].delta.content:
522
- full_response += chunk.choices[0].delta.content
523
-
524
- # Add assistant response to history
525
- self.conversation_history.append({
526
- "role": "assistant",
527
- "content": full_response
528
- })
529
-
530
- return full_response
531
-
532
- except Exception as e:
533
- raise Exception(f"LLM response failed: {str(e)}")
534
-
535
- def synthesize_speech(self, text):
536
- """Convert text to speech using OpenAI TTS"""
537
- try:
538
- response = self.client.audio.speech.create(
539
- model="tts-1", # Fast model (tts-1-hd for higher quality)
540
- voice=self.voice, # Options: alloy, echo, fable, onyx, nova, shimmer
541
- input=text,
542
- speed=1.0
543
- )
544
-
545
- # Save to temporary file with proper handling for Gradio
546
- temp_dir = tempfile.gettempdir()
547
- output_path = os.path.join(temp_dir, f"tts_output_{os.getpid()}_{hash(text) % 10000}.mp3")
548
-
549
- with open(output_path, "wb") as f:
550
- f.write(response.content)
551
-
552
- return output_path
553
-
554
- except Exception as e:
555
- raise Exception(f"Speech synthesis failed: {str(e)}")
556
-
557
- def process_voice_input(self, audio_input, progress=gr.Progress()):
558
- """Full pipeline: Voice → Text → LLM → Voice"""
559
-
560
- if audio_input is None:
561
- return None, "⚠️ No audio detected. Please record your voice.", None, self._format_history()
562
-
563
- try:
564
- # Step 1: Speech to Text
565
- progress(0.2, desc="🎧 Transcribing your voice...")
566
- user_text = self.transcribe_audio(audio_input)
567
-
568
- if not user_text.strip():
569
- return None, "⚠️ Could not understand audio. Please speak clearly.", None, self._format_history()
570
-
571
- # Step 2: Get LLM Response
572
- progress(0.5, desc="🤔 Thinking...")
573
- assistant_text = self.get_llm_response(user_text)
574
-
575
- # Step 3: Text to Speech
576
- progress(0.8, desc="🔊 Generating voice response...")
577
- audio_output = self.synthesize_speech(assistant_text)
578
-
579
- # Format status
580
- status = f"**You:** {user_text}\n\n**Assistant:** {assistant_text}"
581
-
582
- # Format conversation history
583
- chat_history = self._format_history()
584
-
585
- progress(1.0, desc="✓ Done!")
586
-
587
- return audio_output, status, None, chat_history
588
-
589
- except Exception as e:
590
- error_msg = f"❌ Error: {str(e)}\n\nPlease check your API key and try again."
591
- return None, error_msg, None, self._format_history()
592
-
593
- def _format_history(self):
594
- """Format conversation history for chatbot display"""
595
- formatted = []
596
- for i in range(0, len(self.conversation_history), 2):
597
- if i + 1 < len(self.conversation_history):
598
- formatted.append((
599
- self.conversation_history[i]["content"],
600
- self.conversation_history[i + 1]["content"]
601
- ))
602
- return formatted
603
-
604
- def clear_conversation(self):
605
- """Clear conversation history"""
606
- self.conversation_history = []
607
- return None, "Conversation cleared!", None, []
608
-
609
- def change_voice(self, voice_name):
610
- """Change TTS voice"""
611
- self.voice = voice_name
612
- return f"✓ Voice changed to: **{voice_name}**"
613
-
614
-
615
- # Initialize agent (will use environment variable)
616
- agent = None
617
-
618
- def initialize_agent():
619
- """Initialize agent with API key check"""
620
- global agent
621
- api_key = os.getenv("OPENAI_API_KEY")
622
-
623
- if not api_key:
624
- return "❌ OpenAI API key not found!\n\nPlease set it in Hugging Face Space settings:\nSettings → Repository secrets → New secret\nName: OPENAI_API_KEY\nValue: your-api-key"
625
-
626
- try:
627
- agent = RealtimeVoiceAgent(api_key=api_key)
628
- return "✅ Voice Agent initialized successfully!\n\n🎤 You can now start talking!"
629
- except Exception as e:
630
- return f"❌ Initialization failed: {str(e)}"
631
-
632
- def process_audio_wrapper(audio, progress=gr.Progress()):
633
- """Wrapper to check if agent is initialized"""
634
- if agent is None:
635
- return None, "⚠️ Please initialize the agent first!", None, []
636
- return agent.process_voice_input(audio, progress)
637
-
638
- def clear_wrapper():
639
- """Wrapper for clear function"""
640
- if agent is None:
641
- return None, "⚠️ Please initialize the agent first!", None, []
642
- return agent.clear_conversation()
643
-
644
- def change_voice_wrapper(voice_name):
645
- """Wrapper for voice change function"""
646
- if agent is None:
647
- return "⚠️ Please initialize the agent first!"
648
- return agent.change_voice(voice_name)
649
-
650
-
651
- # Create Gradio Interface
652
- with gr.Blocks(
653
- title="🎙️ Real-Time Voice Agent",
654
- theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"),
655
- css="""
656
- .main-header {text-align: center; padding: 30px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px;}
657
- .status-box {background: #f8f9fa; padding: 15px; border-radius: 8px; border-left: 4px solid #667eea;}
658
- .warning-box {background: #fff3cd; padding: 15px; border-radius: 8px; border-left: 4px solid #ffc107;}
659
- """
660
- ) as demo:
661
-
662
- gr.Markdown("""
663
- <div class="main-header">
664
- <h1>🎙️ Real-Time Voice Agent</h1>
665
- <p>State-of-the-art voice conversation powered by OpenAI</p>
666
- <p><em>Whisper + GPT-4o-mini + TTS</em></p>
667
- </div>
668
- """)
669
-
670
- with gr.Row():
671
- with gr.Column(scale=1):
672
- gr.Markdown("""
673
- ### 🚀 Quick Start
674
-
675
- 1. **Initialize** the agent below
676
- 2. **Click** the microphone 🎤
677
- 3. **Speak** your question
678
- 4. **Listen** to the AI response
679
-
680
- ---
681
-
682
- ### ⚙️ Settings
683
- """)
684
-
685
- init_button = gr.Button(
686
- "🤖 Initialize Voice Agent",
687
- variant="primary",
688
- size="lg"
689
- )
690
-
691
- init_status = gr.Markdown(
692
- '<div class="warning-box">⚠️ Click "Initialize Voice Agent" to start</div>'
693
- )
694
-
695
- gr.Markdown("---")
696
-
697
- voice_selector = gr.Dropdown(
698
- choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
699
- value="alloy",
700
- label="🎵 AI Voice Style",
701
- info="Select the voice for AI responses"
702
- )
703
-
704
- voice_status = gr.Markdown("")
705
-
706
- gr.Markdown("""
707
- ---
708
-
709
- ### 💡 Tips
710
-
711
- - 🎯 Speak clearly and naturally
712
- - ⏱️ Keep messages under 20 seconds
713
- - 🔇 Minimize background noise
714
- - 🌐 Use Chrome for best compatibility
715
-
716
- ### 🎤 Voice Styles
717
-
718
- - **Alloy**: Neutral, balanced
719
- - **Echo**: Male, clear
720
- - **Fable**: British, expressive
721
- - **Onyx**: Deep, authoritative
722
- - **Nova**: Female, friendly
723
- - **Shimmer**: Warm, engaging
724
- """)
725
-
726
- with gr.Column(scale=2):
727
- gr.Markdown("## 🎤 Voice Conversation")
728
-
729
- audio_input = gr.Audio(
730
- sources=["microphone", "upload"],
731
- type="filepath",
732
- label="🎤 Click to Record Your Voice"
733
- )
734
-
735
- process_status = gr.Markdown(
736
- '<div class="status-box">**Status:** Ready to listen...</div>',
737
- elem_classes=["status-box"]
738
- )
739
-
740
- audio_output = gr.Audio(
741
- label="🔊 AI Voice Response",
742
- type="filepath",
743
- autoplay=True
744
- )
745
-
746
- with gr.Row():
747
- process_btn = gr.Button(
748
- "💬 Process Voice",
749
- variant="secondary",
750
- size="lg",
751
- scale=3
752
- )
753
- clear_btn = gr.Button(
754
- "🗑️ Clear History",
755
- variant="stop",
756
- scale=1
757
- )
758
-
759
- gr.Markdown("---")
760
- gr.Markdown("## 💭 Conversation History")
761
-
762
- conversation_display = gr.Chatbot(
763
- label="Your Conversation",
764
- height=400,
765
- bubble_full_width=False,
766
- avatar_images=(None, "🤖")
767
- )
768
-
769
- gr.Markdown("""
770
- ---
771
-
772
- ### 📊 Technical Stack
773
-
774
- - **Speech Recognition**: OpenAI Whisper (99%+ accuracy)
775
- - **Language Model**: GPT-4o-mini (fast, intelligent)
776
- - **Speech Synthesis**: OpenAI TTS (natural, expressive)
777
- - **Interface**: Gradio (real-time updates)
778
-
779
- ### 🔐 Privacy & Costs
780
-
781
- - Requires OpenAI API key (set in Space settings)
782
- - Approximate cost: $0.01-0.03 per conversation
783
- - Audio is processed through OpenAI's API
784
- - No data is stored permanently
785
-
786
- ### 🐛 Troubleshooting
787
-
788
- - **No audio?** Check browser microphone permissions
789
- - **API error?** Verify your OpenAI API key in Space settings
790
- - **Slow response?** Try shorter messages or upgrade to paid OpenAI plan
791
-
792
- ---
793
-
794
- <div style="text-align: center; color: #666;">
795
- Built with ❤️ using OpenAI APIs |
796
- <a href="https://github.com/openai/whisper">Whisper</a> |
797
- <a href="https://platform.openai.com/docs/guides/text-to-speech">TTS</a> |
798
- <a href="https://platform.openai.com/docs/guides/chat">GPT-4</a>
799
- </div>
800
- """)
801
-
802
- # Event handlers
803
- init_button.click(
804
- fn=initialize_agent,
805
- outputs=[init_status]
806
- )
807
-
808
- process_btn.click(
809
- fn=process_audio_wrapper,
810
- inputs=[audio_input],
811
- outputs=[audio_output, process_status, audio_input, conversation_display]
812
- )
813
-
814
- # Auto-process when recording stops
815
- audio_input.stop_recording(
816
- fn=process_audio_wrapper,
817
- inputs=[audio_input],
818
- outputs=[audio_output, process_status, audio_input, conversation_display]
819
- )
820
-
821
- clear_btn.click(
822
- fn=clear_wrapper,
823
- outputs=[audio_output, process_status, audio_input, conversation_display]
824
- )
825
-
826
- voice_selector.change(
827
- fn=change_voice_wrapper,
828
- inputs=[voice_selector],
829
- outputs=[voice_status]
830
- )
831
-
832
- if __name__ == "__main__":
833
- demo.launch(
834
- server_name="0.0.0.0",
835
- share=False,
836
- show_error=True
837
- )
 
6
  import numpy as np
7
  from openai import OpenAI
8
 
 
 
 
 
 
 
 
 
9
  class RealtimeVoiceAgent:
10
  def __init__(self, api_key=None):
11
  """Initialize the voice agent with OpenAI"""
 
446
  share=False,
447
  show_error=True
448
  )