Spaces:

MCP-1st-Birthday
/

AI-RADIO

Sleeping

App Files Files Community

Nikita Makarov committed on Nov 30, 2025

Commit

11a8acd

1 Parent(s): 14b0c7b

Fix voice recording: use Gradio Audio component for browser-based recording

Browse files

Files changed (1) hide show

src/app.py +61 -14

src/app.py CHANGED Viewed

@@ -996,18 +996,59 @@ def split_text_into_segments(text: str, max_sentences: int = 2) -> List[str]:
     return segments
-def handle_voice_request():
-    """Handle voice input request for song"""
-    # Check if voice input is available
-    if not voice_input_service.available:
-        return "⚠️ Voice input is not available. Please install PortAudio and pyaudio.\n\nSee INSTALL_VOICE_INPUT.md for instructions.\n\nYou can still request songs by typing in preferences!", None, ""
     try:
-        # Listen and recognize
-        recognized_text = voice_input_service.listen_and_recognize(timeout=5, phrase_time_limit=10)
-        if not recognized_text:
-            return "❌ Could not recognize speech. Please try again.", None, ""
         # Process the request
         song_request = voice_input_service.process_song_request(recognized_text)
@@ -1017,7 +1058,7 @@ def handle_voice_request():
         tracks = agent.music_server.search_by_request(song_request)
         if not tracks:
-            return f"❌ Could not find music for: '{recognized_text}'. Try saying something like 'play pop music' or 'play a song by [artist name]'!", None, ""
         # Get the first matching track
         track = tracks[0]
@@ -1633,8 +1674,14 @@ with gr.Blocks(css=custom_css, title="AI Radio 🎵", theme=gr.themes.Soft(), he
                 stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg", elem_classes="control-button")
             with gr.Row():
-                voice_btn = gr.Button("🎤 Ask for a Song", variant="primary", size="lg", elem_classes="control-button")
-                voice_status = gr.Textbox(label="Voice Request", value="Click to request a song by voice", interactive=False)
             # Like/Dislike buttons for current track
             with gr.Row():
@@ -1690,10 +1737,10 @@ with gr.Blocks(css=custom_css, title="AI Radio 🎵", theme=gr.themes.Soft(), he
                 js="() => { if(window.cancelNextSegment) window.cancelNextSegment(); }"
             )
-            # Voice input button - direct click without .then() chain
             voice_btn.click(
                 fn=handle_voice_request,
-                inputs=[],
                 outputs=[voice_status, audio_output, music_player, player_timer]
             )

     return segments
+def handle_voice_request(audio_file):
+    """Handle voice input request for song from uploaded audio file"""
+    if not audio_file:
+        return "⚠️ Please record your voice request first!", None, "", gr.Timer(value=0, active=False)
     try:
+        # Use speech_recognition to process the audio file
+        import speech_recognition as sr
+        from pydub import AudioSegment
+        recognizer = sr.Recognizer()
+        # Convert audio to WAV format if needed (speech_recognition requires WAV)
+        audio_path = audio_file
+        if isinstance(audio_file, tuple):
+            # Gradio Audio returns (sample_rate, audio_data) or filepath
+            audio_path = audio_file[1] if len(audio_file) > 1 else audio_file[0]
+        # If it's not a WAV file, convert it
+        if audio_path and not audio_path.endswith('.wav'):
+            try:
+                # Load and convert to WAV
+                audio = AudioSegment.from_file(audio_path)
+                wav_path = audio_path.rsplit('.', 1)[0] + '.wav'
+                audio.export(wav_path, format="wav")
+                audio_path = wav_path
+            except Exception as e:
+                print(f"⚠️ Could not convert audio: {e}, trying original file")
+        # Load audio file for recognition
+        try:
+            with sr.AudioFile(audio_path) as source:
+                audio = recognizer.record(source)
+        except Exception as e:
+            # Direct load failed: fall back to converting with pydub, then retry
+            try:
+                audio_seg = AudioSegment.from_file(audio_path)
+                wav_temp = os.path.join(AUDIO_DIR, f"temp_voice_{int(time.time())}.wav")
+                audio_seg.export(wav_temp, format="wav")
+                with sr.AudioFile(wav_temp) as source:
+                    audio = recognizer.record(source)
+                audio_path = wav_temp
+            except Exception as conv_e:
+                return f"❌ Could not process audio file: {conv_e}. Please try recording again.", None, "", gr.Timer(value=0, active=False)
+        # Recognize speech using Google's API
+        try:
+            recognized_text = recognizer.recognize_google(audio)
+            print(f"🎤 Recognized: {recognized_text}")
+        except sr.UnknownValueError:
+            return "❌ Could not understand audio. Please speak clearly and try again.", None, "", gr.Timer(value=0, active=False)
+        except sr.RequestError as e:
+            return f"❌ Error with speech recognition service: {e}. Please try again.", None, "", gr.Timer(value=0, active=False)
         # Process the request
         song_request = voice_input_service.process_song_request(recognized_text)
         tracks = agent.music_server.search_by_request(song_request)
         if not tracks:
+            return f"❌ Could not find music for: '{recognized_text}'. Try saying something like 'play pop music' or 'play a song by [artist name]'!", None, "", gr.Timer(value=0, active=False)
         # Get the first matching track
         track = tracks[0]
                 stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg", elem_classes="control-button")
             with gr.Row():
+                voice_audio = gr.Audio(
+                    label="🎤 Record Your Song Request",
+                    type="filepath",
+                    sources=["microphone"],
+                    format="wav"
+                )
+                voice_btn = gr.Button("🎤 Process Voice Request", variant="primary", size="lg", elem_classes="control-button")
+                voice_status = gr.Textbox(label="Voice Request Status", value="Record your voice request above, then click the button", interactive=False)
             # Like/Dislike buttons for current track
             with gr.Row():
                 js="() => { if(window.cancelNextSegment) window.cancelNextSegment(); }"
             )
+            # Voice input button - process recorded audio
             voice_btn.click(
                 fn=handle_voice_request,
+                inputs=[voice_audio],
                 outputs=[voice_status, audio_output, music_player, player_timer]
             )