Spaces:
Sleeping
Sleeping
Nikita Makarov
committed on
Commit
·
11a8acd
1
Parent(s):
14b0c7b
Fix voice recording: use Gradio Audio component for browser-based recording
Browse files- src/app.py +61 -14
src/app.py
CHANGED
|
@@ -996,18 +996,59 @@ def split_text_into_segments(text: str, max_sentences: int = 2) -> List[str]:
|
|
| 996 |
|
| 997 |
return segments
|
| 998 |
|
| 999 |
-
def handle_voice_request():
|
| 1000 |
-
"""Handle voice input request for song"""
|
| 1001 |
-
|
| 1002 |
-
|
| 1003 |
-
return "⚠️ Voice input is not available. Please install PortAudio and pyaudio.\n\nSee INSTALL_VOICE_INPUT.md for instructions.\n\nYou can still request songs by typing in preferences!", None, ""
|
| 1004 |
|
| 1005 |
try:
|
| 1006 |
-
#
|
| 1007 |
-
|
|
|
|
| 1008 |
|
| 1009 |
-
|
| 1010 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1011 |
|
| 1012 |
# Process the request
|
| 1013 |
song_request = voice_input_service.process_song_request(recognized_text)
|
|
@@ -1017,7 +1058,7 @@ def handle_voice_request():
|
|
| 1017 |
tracks = agent.music_server.search_by_request(song_request)
|
| 1018 |
|
| 1019 |
if not tracks:
|
| 1020 |
-
return f"❌ Could not find music for: '{recognized_text}'. Try saying something like 'play pop music' or 'play a song by [artist name]'!", None, ""
|
| 1021 |
|
| 1022 |
# Get the first matching track
|
| 1023 |
track = tracks[0]
|
|
@@ -1633,8 +1674,14 @@ with gr.Blocks(css=custom_css, title="AI Radio 🎵", theme=gr.themes.Soft(), he
|
|
| 1633 |
stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg", elem_classes="control-button")
|
| 1634 |
|
| 1635 |
with gr.Row():
|
| 1636 |
-
|
| 1637 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1638 |
|
| 1639 |
# Like/Dislike buttons for current track
|
| 1640 |
with gr.Row():
|
|
@@ -1690,10 +1737,10 @@ with gr.Blocks(css=custom_css, title="AI Radio 🎵", theme=gr.themes.Soft(), he
|
|
| 1690 |
js="() => { if(window.cancelNextSegment) window.cancelNextSegment(); }"
|
| 1691 |
)
|
| 1692 |
|
| 1693 |
-
# Voice input button -
|
| 1694 |
voice_btn.click(
|
| 1695 |
fn=handle_voice_request,
|
| 1696 |
-
inputs=[],
|
| 1697 |
outputs=[voice_status, audio_output, music_player, player_timer]
|
| 1698 |
)
|
| 1699 |
|
|
|
|
| 996 |
|
| 997 |
return segments
|
| 998 |
|
| 999 |
+
def handle_voice_request(audio_file):
|
| 1000 |
+
"""Handle voice input request for song from uploaded audio file"""
|
| 1001 |
+
if not audio_file:
|
| 1002 |
+
return "⚠️ Please record your voice request first!", None, "", gr.Timer(value=0, active=False)
|
|
|
|
| 1003 |
|
| 1004 |
try:
|
| 1005 |
+
# Use speech_recognition to process the audio file
|
| 1006 |
+
import speech_recognition as sr
|
| 1007 |
+
from pydub import AudioSegment
|
| 1008 |
|
| 1009 |
+
recognizer = sr.Recognizer()
|
| 1010 |
+
|
| 1011 |
+
# Convert audio to WAV format if needed (speech_recognition requires WAV)
|
| 1012 |
+
audio_path = audio_file
|
| 1013 |
+
if isinstance(audio_file, tuple):
|
| 1014 |
+
# Gradio Audio returns (sample_rate, audio_data) or filepath
|
| 1015 |
+
audio_path = audio_file[1] if len(audio_file) > 1 else audio_file[0]
|
| 1016 |
+
|
| 1017 |
+
# If it's not a WAV file, convert it
|
| 1018 |
+
if audio_path and not audio_path.endswith('.wav'):
|
| 1019 |
+
try:
|
| 1020 |
+
# Load and convert to WAV
|
| 1021 |
+
audio = AudioSegment.from_file(audio_path)
|
| 1022 |
+
wav_path = audio_path.rsplit('.', 1)[0] + '.wav'
|
| 1023 |
+
audio.export(wav_path, format="wav")
|
| 1024 |
+
audio_path = wav_path
|
| 1025 |
+
except Exception as e:
|
| 1026 |
+
print(f"⚠️ Could not convert audio: {e}, trying original file")
|
| 1027 |
+
|
| 1028 |
+
# Load audio file for recognition
|
| 1029 |
+
try:
|
| 1030 |
+
with sr.AudioFile(audio_path) as source:
|
| 1031 |
+
audio = recognizer.record(source)
|
| 1032 |
+
except Exception as e:
|
| 1033 |
+
# Try with pydub conversion first
|
| 1034 |
+
try:
|
| 1035 |
+
audio_seg = AudioSegment.from_file(audio_path)
|
| 1036 |
+
wav_temp = os.path.join(AUDIO_DIR, f"temp_voice_{int(time.time())}.wav")
|
| 1037 |
+
audio_seg.export(wav_temp, format="wav")
|
| 1038 |
+
with sr.AudioFile(wav_temp) as source:
|
| 1039 |
+
audio = recognizer.record(source)
|
| 1040 |
+
audio_path = wav_temp
|
| 1041 |
+
except Exception as conv_e:
|
| 1042 |
+
return f"❌ Could not process audio file: {conv_e}. Please try recording again.", None, "", gr.Timer(value=0, active=False)
|
| 1043 |
+
|
| 1044 |
+
# Recognize speech using Google's API
|
| 1045 |
+
try:
|
| 1046 |
+
recognized_text = recognizer.recognize_google(audio)
|
| 1047 |
+
print(f"🎤 Recognized: {recognized_text}")
|
| 1048 |
+
except sr.UnknownValueError:
|
| 1049 |
+
return "❌ Could not understand audio. Please speak clearly and try again.", None, "", gr.Timer(value=0, active=False)
|
| 1050 |
+
except sr.RequestError as e:
|
| 1051 |
+
return f"❌ Error with speech recognition service: {e}. Please try again.", None, "", gr.Timer(value=0, active=False)
|
| 1052 |
|
| 1053 |
# Process the request
|
| 1054 |
song_request = voice_input_service.process_song_request(recognized_text)
|
|
|
|
| 1058 |
tracks = agent.music_server.search_by_request(song_request)
|
| 1059 |
|
| 1060 |
if not tracks:
|
| 1061 |
+
return f"❌ Could not find music for: '{recognized_text}'. Try saying something like 'play pop music' or 'play a song by [artist name]'!", None, "", gr.Timer(value=0, active=False)
|
| 1062 |
|
| 1063 |
# Get the first matching track
|
| 1064 |
track = tracks[0]
|
|
|
|
| 1674 |
stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg", elem_classes="control-button")
|
| 1675 |
|
| 1676 |
with gr.Row():
|
| 1677 |
+
voice_audio = gr.Audio(
|
| 1678 |
+
label="🎤 Record Your Song Request",
|
| 1679 |
+
type="filepath",
|
| 1680 |
+
sources=["microphone"],
|
| 1681 |
+
format="wav"
|
| 1682 |
+
)
|
| 1683 |
+
voice_btn = gr.Button("🎤 Process Voice Request", variant="primary", size="lg", elem_classes="control-button")
|
| 1684 |
+
voice_status = gr.Textbox(label="Voice Request Status", value="Record your voice request above, then click the button", interactive=False)
|
| 1685 |
|
| 1686 |
# Like/Dislike buttons for current track
|
| 1687 |
with gr.Row():
|
|
|
|
| 1737 |
js="() => { if(window.cancelNextSegment) window.cancelNextSegment(); }"
|
| 1738 |
)
|
| 1739 |
|
| 1740 |
+
# Voice input button - process recorded audio
|
| 1741 |
voice_btn.click(
|
| 1742 |
fn=handle_voice_request,
|
| 1743 |
+
inputs=[voice_audio],
|
| 1744 |
outputs=[voice_status, audio_output, music_player, player_timer]
|
| 1745 |
)
|
| 1746 |
|