Nikita Makarov committed on
Commit
11a8acd
·
1 Parent(s): 14b0c7b

Fix voice recording: use Gradio Audio component for browser-based recording

Browse files
Files changed (1) hide show
  1. src/app.py +61 -14
src/app.py CHANGED
@@ -996,18 +996,59 @@ def split_text_into_segments(text: str, max_sentences: int = 2) -> List[str]:
996
 
997
  return segments
998
 
999
- def handle_voice_request():
1000
- """Handle voice input request for song"""
1001
- # Check if voice input is available
1002
- if not voice_input_service.available:
1003
- return "⚠️ Voice input is not available. Please install PortAudio and pyaudio.\n\nSee INSTALL_VOICE_INPUT.md for instructions.\n\nYou can still request songs by typing in preferences!", None, ""
1004
 
1005
  try:
1006
- # Listen and recognize
1007
- recognized_text = voice_input_service.listen_and_recognize(timeout=5, phrase_time_limit=10)
 
1008
 
1009
- if not recognized_text:
1010
- return "❌ Could not recognize speech. Please try again.", None, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1011
 
1012
  # Process the request
1013
  song_request = voice_input_service.process_song_request(recognized_text)
@@ -1017,7 +1058,7 @@ def handle_voice_request():
1017
  tracks = agent.music_server.search_by_request(song_request)
1018
 
1019
  if not tracks:
1020
- return f"❌ Could not find music for: '{recognized_text}'. Try saying something like 'play pop music' or 'play a song by [artist name]'!", None, ""
1021
 
1022
  # Get the first matching track
1023
  track = tracks[0]
@@ -1633,8 +1674,14 @@ with gr.Blocks(css=custom_css, title="AI Radio 🎵", theme=gr.themes.Soft(), he
1633
  stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg", elem_classes="control-button")
1634
 
1635
  with gr.Row():
1636
- voice_btn = gr.Button("🎤 Ask for a Song", variant="primary", size="lg", elem_classes="control-button")
1637
- voice_status = gr.Textbox(label="Voice Request", value="Click to request a song by voice", interactive=False)
 
 
 
 
 
 
1638
 
1639
  # Like/Dislike buttons for current track
1640
  with gr.Row():
@@ -1690,10 +1737,10 @@ with gr.Blocks(css=custom_css, title="AI Radio 🎵", theme=gr.themes.Soft(), he
1690
  js="() => { if(window.cancelNextSegment) window.cancelNextSegment(); }"
1691
  )
1692
 
1693
- # Voice input button - direct click without .then() chain
1694
  voice_btn.click(
1695
  fn=handle_voice_request,
1696
- inputs=[],
1697
  outputs=[voice_status, audio_output, music_player, player_timer]
1698
  )
1699
 
 
996
 
997
  return segments
998
 
999
+ def handle_voice_request(audio_file):
1000
+ """Handle voice input request for song from uploaded audio file"""
1001
+ if not audio_file:
1002
+ return "⚠️ Please record your voice request first!", None, "", gr.Timer(value=0, active=False)
 
1003
 
1004
  try:
1005
+ # Use speech_recognition to process the audio file
1006
+ import speech_recognition as sr
1007
+ from pydub import AudioSegment
1008
 
1009
+ recognizer = sr.Recognizer()
1010
+
1011
+ # Convert audio to WAV format if needed (speech_recognition requires WAV)
1012
+ audio_path = audio_file
1013
+ if isinstance(audio_file, tuple):
1014
+ # Gradio Audio returns (sample_rate, audio_data) or filepath
1015
+ audio_path = audio_file[1] if len(audio_file) > 1 else audio_file[0]
1016
+
1017
+ # If it's not a WAV file, convert it
1018
+ if audio_path and not audio_path.endswith('.wav'):
1019
+ try:
1020
+ # Load and convert to WAV
1021
+ audio = AudioSegment.from_file(audio_path)
1022
+ wav_path = audio_path.rsplit('.', 1)[0] + '.wav'
1023
+ audio.export(wav_path, format="wav")
1024
+ audio_path = wav_path
1025
+ except Exception as e:
1026
+ print(f"⚠️ Could not convert audio: {e}, trying original file")
1027
+
1028
+ # Load audio file for recognition
1029
+ try:
1030
+ with sr.AudioFile(audio_path) as source:
1031
+ audio = recognizer.record(source)
1032
+ except Exception as e:
1033
+ # Try with pydub conversion first
1034
+ try:
1035
+ audio_seg = AudioSegment.from_file(audio_path)
1036
+ wav_temp = os.path.join(AUDIO_DIR, f"temp_voice_{int(time.time())}.wav")
1037
+ audio_seg.export(wav_temp, format="wav")
1038
+ with sr.AudioFile(wav_temp) as source:
1039
+ audio = recognizer.record(source)
1040
+ audio_path = wav_temp
1041
+ except Exception as conv_e:
1042
+ return f"❌ Could not process audio file: {conv_e}. Please try recording again.", None, "", gr.Timer(value=0, active=False)
1043
+
1044
+ # Recognize speech using Google's API
1045
+ try:
1046
+ recognized_text = recognizer.recognize_google(audio)
1047
+ print(f"🎤 Recognized: {recognized_text}")
1048
+ except sr.UnknownValueError:
1049
+ return "❌ Could not understand audio. Please speak clearly and try again.", None, "", gr.Timer(value=0, active=False)
1050
+ except sr.RequestError as e:
1051
+ return f"❌ Error with speech recognition service: {e}. Please try again.", None, "", gr.Timer(value=0, active=False)
1052
 
1053
  # Process the request
1054
  song_request = voice_input_service.process_song_request(recognized_text)
 
1058
  tracks = agent.music_server.search_by_request(song_request)
1059
 
1060
  if not tracks:
1061
+ return f"❌ Could not find music for: '{recognized_text}'. Try saying something like 'play pop music' or 'play a song by [artist name]'!", None, "", gr.Timer(value=0, active=False)
1062
 
1063
  # Get the first matching track
1064
  track = tracks[0]
 
1674
  stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg", elem_classes="control-button")
1675
 
1676
  with gr.Row():
1677
+ voice_audio = gr.Audio(
1678
+ label="🎤 Record Your Song Request",
1679
+ type="filepath",
1680
+ sources=["microphone"],
1681
+ format="wav"
1682
+ )
1683
+ voice_btn = gr.Button("🎤 Process Voice Request", variant="primary", size="lg", elem_classes="control-button")
1684
+ voice_status = gr.Textbox(label="Voice Request Status", value="Record your voice request above, then click the button", interactive=False)
1685
 
1686
  # Like/Dislike buttons for current track
1687
  with gr.Row():
 
1737
  js="() => { if(window.cancelNextSegment) window.cancelNextSegment(); }"
1738
  )
1739
 
1740
+ # Voice input button - process recorded audio
1741
  voice_btn.click(
1742
  fn=handle_voice_request,
1743
+ inputs=[voice_audio],
1744
  outputs=[voice_status, audio_output, music_player, player_timer]
1745
  )
1746