Spaces:

STARBORN
/

voice_agent_tutorial

Sleeping

App Files Files Community

STARBORN committed 17 days ago

Commit

2087bf7

verified ·

1 Parent(s): 7cf18aa

Upload app.py

Browse files

Files changed (1) hide show

app.py +89 -14

app.py CHANGED Viewed

@@ -6,7 +6,13 @@ import asyncio
 import edge_tts
 import soundfile as sf
 from groq import Groq
-from fastrtc import WebRTC, ReplyOnPause, get_hf_turn_credentials
 # Initialize Groq
 client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
@@ -45,19 +51,88 @@ def process_audio(audio: tuple[int, np.ndarray]):
     reply_text = response.choices[0].message.content
     return asyncio.run(text_to_speech_logic(reply_text))
-with gr.Blocks() as demo:
-    gr.Markdown("# 🎙️ Voice Agent Live (CPU)")
-    webrtc_comp = WebRTC(
-        label="Voice Chat",
-        mode="send-receive",
-        modality="audio",
-        rtc_configuration=get_hf_turn_credentials()
-    )
-    webrtc_comp.stream(
-        fn=ReplyOnPause(process_audio),
-        inputs=[webrtc_comp],
-        outputs=[webrtc_comp]
     )
 if __name__ == "__main__":
-    demo.launch()

 import edge_tts
 import soundfile as sf
 from groq import Groq
+try:
+    from fastrtc import WebRTC, ReplyOnPause, get_hf_turn_credentials
+    FASTRTC_AVAILABLE = True
+except ImportError:
+    FASTRTC_AVAILABLE = False
+    print("FastRTC not available, using fallback UI")
 # Initialize Groq
 client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
     reply_text = response.choices[0].message.content
     return asyncio.run(text_to_speech_logic(reply_text))
+# Fallback function for regular audio interface
+def process_audio_file(audio_file):
+    if audio_file is None:
+        return None, "Please record or upload audio"
+    # Load audio
+    y, sr = librosa.load(audio_file, sr=16000)
+    sf.write("input.wav", y, sr)
+    # Transcribe
+    with open("input.wav", "rb") as file:
+        transcription = client.audio.transcriptions.create(
+            file=("input.wav", file.read()),
+            model="whisper-large-v3-turbo",
+        )
+    # Get response
+    response = client.chat.completions.create(
+        model="llama-3.3-70b-versatile",
+        messages=[
+            {"role": "system", "content": "You are a concise voice assistant. Give 1-sentence answers."},
+            {"role": "user", "content": transcription.text}
+        ]
     )
+    reply_text = response.choices[0].message.content
+    # Generate speech
+    sr_out, audio_out = asyncio.run(text_to_speech_logic(reply_text))
+    return "temp_op.mp3", f"**You said:** {transcription.text}\n\n**Assistant:** {reply_text}"
+# Create the interface
+with gr.Blocks(title="Voice Agent Live") as demo:
+    gr.Markdown("# 🎙️ Voice Agent Live")
+    gr.Markdown("Speak to the AI assistant and get voice responses!")
+    if FASTRTC_AVAILABLE:
+        gr.Markdown("### Real-time Voice Chat (WebRTC)")
+        try:
+            webrtc_comp = WebRTC(
+                label="Voice Chat",
+                mode="send-receive",
+                modality="audio",
+                rtc_configuration=get_hf_turn_credentials()
+            )
+            webrtc_comp.stream(
+                fn=ReplyOnPause(process_audio),
+                inputs=[webrtc_comp],
+                outputs=[webrtc_comp]
+            )
+        except Exception as e:
+            gr.Markdown(f"⚠️ WebRTC Error: {str(e)}")
+            gr.Markdown("### Using fallback mode below")
+            FASTRTC_AVAILABLE = False
+    if not FASTRTC_AVAILABLE:
+        gr.Markdown("### Voice Chat (Record/Upload)")
+        with gr.Row():
+            with gr.Column():
+                audio_input = gr.Audio(
+                    sources=["microphone", "upload"],
+                    type="filepath",
+                    label="Record or Upload Audio"
+                )
+                submit_btn = gr.Button("🎤 Process Audio", variant="primary", size="lg")
+            with gr.Column():
+                audio_output = gr.Audio(label="Assistant Response", type="filepath")
+                text_output = gr.Markdown()
+        submit_btn.click(
+            fn=process_audio_file,
+            inputs=[audio_input],
+            outputs=[audio_output, text_output]
+        )
+        gr.Examples(
+            examples=[],
+            inputs=audio_input,
+            label="Try recording your voice!"
+        )
 if __name__ == "__main__":
+    demo.launch(share=False)