Spaces:

STARBORN
/

voice_agent_tutorial

Sleeping

App Files Files Community

STARBORN committed 19 days ago

Commit

fba91f6

verified ·

1 Parent(s): 1a68b9a

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -2

app.py CHANGED Viewed

@@ -1,3 +1,62 @@
-im
-demo.launch()

+import gradio as gr
+import os
+import numpy as np
+import librosa
+import asyncio
+import edge_tts
+import soundfile as sf
+from groq import Groq
+from fastrtc import WebRTC, ReplyOnPause, get_hf_turn_credentials
+# Initialize the Groq API client shared by the handlers in this module.
+# The key is read from the GROQ_API_KEY environment variable; os.environ.get
+# returns None (rather than raising) when the variable is not set.
+client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
+async def text_to_speech_logic(text):
+    """Synthesize ``text`` to speech and return it as ``(sample_rate, audio)``.
+
+    The text is rendered with the edge-tts "en-US-AndrewNeural" voice into a
+    temporary MP3 file, which is then decoded with librosa at 16 kHz.
+
+    Args:
+        text: The text to speak.
+
+    Returns:
+        A ``(sr, audio)`` tuple where ``sr`` is the sample rate reported by
+        librosa (16000 requested) and ``audio`` has shape ``(1, samples)``.
+    """
+    import tempfile
+
+    # Use a unique temp file per call: a fixed name such as "temp_op.mp3"
+    # would be overwritten when two replies are synthesized concurrently.
+    fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
+    os.close(fd)
+    try:
+        communicate = edge_tts.Communicate(text, "en-US-AndrewNeural")
+        await communicate.save(mp3_path)
+        audio, sr = librosa.load(mp3_path, sr=16000)
+    finally:
+        # Always remove the temp file, even if synthesis or decoding fails.
+        os.remove(mp3_path)
+    # Ensure audio is in the correct shape (1, samples) for FastRTC
+    if audio.ndim == 1:
+        audio = audio.reshape(1, -1)
+    return sr, audio
+def process_audio(audio: tuple[int, np.ndarray]):
+    """Turn one utterance of user speech into a spoken assistant reply.
+
+    Pipeline: downmix to mono -> encode as WAV -> transcribe with Groq
+    Whisper -> generate a one-sentence answer with the chat model ->
+    synthesize the answer with ``text_to_speech_logic``.
+
+    Args:
+        audio: ``(sample_rate, samples)`` as delivered by the WebRTC stream.
+
+    Returns:
+        Whatever ``text_to_speech_logic`` returns for the reply text, i.e. a
+        ``(sample_rate, audio)`` tuple.
+    """
+    import io
+
+    sr, y = audio
+    # Downmix to mono with shape (samples,). The channel axis is taken to be
+    # the shorter one, so both (channels, samples) -- e.g. (1, samples) -- and
+    # (samples, channels) layouts work. A fixed mean(axis=1) would collapse a
+    # (1, samples) array into a single value.
+    if y.ndim > 1:
+        channel_axis = int(np.argmin(y.shape))
+        if y.shape[channel_axis] == 1:
+            y = np.squeeze(y, axis=channel_axis)
+        else:
+            # Keep the original dtype so integer PCM is still written as PCM
+            # instead of out-of-range floating point values.
+            y = y.mean(axis=channel_axis).astype(y.dtype)
+    # Encode the WAV in memory: a shared "input.wav" on disk would be
+    # clobbered when several sessions speak at the same time.
+    wav_buffer = io.BytesIO()
+    sf.write(wav_buffer, y, sr, format="WAV")
+    transcription = client.audio.transcriptions.create(
+        file=("input.wav", wav_buffer.getvalue()),
+        model="whisper-large-v3-turbo",
+    )
+    response = client.chat.completions.create(
+        model="llama-3.3-70b-versatile",
+        messages=[
+            {"role": "system", "content": "You are a concise voice assistant. Give 1-sentence answers."},
+            {"role": "user", "content": transcription.text}
+        ]
+    )
+    reply_text = response.choices[0].message.content
+    # text_to_speech_logic is a coroutine; run it to completion from this
+    # synchronous handler.
+    return asyncio.run(text_to_speech_logic(reply_text))
+def _reply_stream(audio: tuple[int, np.ndarray]):
+    """Adapt ``process_audio`` to the generator form used by ``ReplyOnPause``."""
+    # ReplyOnPause handlers are documented as generators that yield audio
+    # chunks; process_audio returns one complete (sample_rate, audio) reply,
+    # so it is yielded as a single chunk.
+    yield process_audio(audio)
+# Build the UI: a single bidirectional audio WebRTC component whose incoming
+# speech is answered by the voice pipeline each time the speaker pauses.
+with gr.Blocks() as demo:
+    gr.Markdown("# 🎙️ Voice Agent Live (CPU)")
+    webrtc_comp = WebRTC(
+        label="Voice Chat",
+        mode="send-receive",
+        modality="audio",
+        rtc_configuration=get_hf_turn_credentials()
+    )
+    webrtc_comp.stream(
+        fn=ReplyOnPause(_reply_stream),
+        inputs=[webrtc_comp],
+        outputs=[webrtc_comp]
+    )
+# Only start the server when executed as a script, not when imported.
+if __name__ == "__main__":
+    demo.launch()