STARBORN committed on
Commit
fba91f6
·
verified ·
1 Parent(s): 1a68b9a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -2
app.py CHANGED
@@ -1,3 +1,62 @@
1
- im
 
 
 
 
 
 
 
 
2
 
3
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import numpy as np
4
+ import librosa
5
+ import asyncio
6
+ import edge_tts
7
+ import soundfile as sf
8
+ from groq import Groq
9
+ from fastrtc import WebRTC, ReplyOnPause, get_hf_turn_credentials
10
 
11
+ # Initialize the Groq client; the API key is read from the GROQ_API_KEY environment variable
12
+ client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
13
+
14
async def text_to_speech_logic(text):
    """Synthesize *text* to speech and return it as ``(sample_rate, audio)``.

    The text is rendered with the edge-tts ``en-US-AndrewNeural`` voice into
    an MP3 file, which is then decoded with librosa at 16 kHz.

    Args:
        text: The sentence(s) to speak.

    Returns:
        A ``(sr, audio)`` tuple where ``sr`` is 16000 and ``audio`` is the
        decoded waveform reshaped to ``(1, samples)``.
    """
    # Function-scope import so this block does not depend on a file-level one.
    import tempfile

    # A unique file per call: a fixed name such as "temp_op.mp3" would be
    # overwritten when two sessions synthesize replies at the same time.
    fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    try:
        communicate = edge_tts.Communicate(text, "en-US-AndrewNeural")
        await communicate.save(mp3_path)
        audio, sr = librosa.load(mp3_path, sr=16000)
    finally:
        # Always clean up so temporary files do not accumulate on disk.
        os.remove(mp3_path)

    # Ensure audio is in the correct shape (1, samples) for FastRTC
    if audio.ndim == 1:
        audio = audio.reshape(1, -1)
    return sr, audio
22
+
23
def process_audio(audio: tuple[int, np.ndarray]):
    """Answer one spoken user turn with synthesized speech.

    Pipeline: down-mix the captured audio to mono, transcribe it with Groq
    Whisper, ask the chat model for a one-sentence answer, and convert that
    answer to speech.

    Args:
        audio: ``(sample_rate, samples)`` as delivered by ``ReplyOnPause``.
            The array may be 1-D, ``(channels, samples)`` or
            ``(samples, channels)``.

    Yields:
        A single ``(sample_rate, audio)`` tuple containing the spoken reply.
        Nothing is yielded when the input audio, the transcript, or the
        model reply is empty.
    """
    # Function-scope import so this block does not depend on a file-level one.
    import io

    sr, y = audio
    y = np.asarray(y)

    if y.ndim > 1:
        # The channel axis is the shorter one. Blindly averaging axis 1 would
        # collapse a (1, samples) buffer -- the layout text_to_speech_logic
        # produces for FastRTC -- into a single sample.
        channel_axis = 0 if y.shape[0] <= y.shape[1] else 1
        if y.shape[channel_axis] == 1:
            # Single channel: flatten without touching dtype or values.
            y = y.reshape(-1)
        else:
            mixed = y.mean(axis=channel_axis)
            # mean() promotes integer PCM to float at full integer scale,
            # which soundfile would clip to [-1, 1]; restore the dtype.
            if np.issubdtype(y.dtype, np.integer):
                mixed = np.rint(mixed).astype(y.dtype)
            y = mixed

    if y.size == 0:
        return

    # Encode the WAV in memory: a shared "input.wav" on disk would be
    # overwritten by concurrent sessions.
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, y, sr, format="WAV")

    transcription = client.audio.transcriptions.create(
        file=("input.wav", wav_buffer.getvalue()),
        model="whisper-large-v3-turbo",
    )
    user_text = (transcription.text or "").strip()
    if not user_text:
        # Nothing intelligible was said; do not query the chat model.
        return

    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
            {"role": "system", "content": "You are a concise voice assistant. Give 1-sentence answers."},
            {"role": "user", "content": user_text},
        ],
    )
    reply_text = response.choices[0].message.content
    if not reply_text:
        return

    # ReplyOnPause iterates over the handler's result, so the reply audio is
    # yielded rather than returned.
    yield asyncio.run(text_to_speech_logic(reply_text))
46
+
47
# Gradio UI: one bidirectional WebRTC audio component whose stream is handled
# by process_audio each time the speaker pauses.
with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ Voice Agent Live (CPU)")

    # TURN credentials are requested while the layout is being built.
    turn_credentials = get_hf_turn_credentials()
    webrtc_comp = WebRTC(
        label="Voice Chat",
        mode="send-receive",
        modality="audio",
        rtc_configuration=turn_credentials,
    )

    # The same component is both the microphone input and the speaker output.
    pause_handler = ReplyOnPause(process_audio)
    webrtc_comp.stream(fn=pause_handler, inputs=[webrtc_comp], outputs=[webrtc_comp])

# Start the app only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()