STARBORN committed on
Commit
1a68b9a
·
verified ·
1 Parent(s): 8f6cad0

Update app.py

Browse files

import asyncio
import os
import tempfile

import edge_tts
import gradio as gr
import librosa
import numpy as np
import soundfile as sf
from fastrtc import WebRTC, ReplyOnPause, get_hf_turn_credentials
from groq import Groq

# Initialize the Groq client once at import time. GROQ_API_KEY must be set in
# the environment; os.environ.get returns None otherwise, so requests would
# fail at call time rather than here.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

async def text_to_speech_logic(text):
    """Synthesize *text* with edge-tts and return ``(sample_rate, audio)``.

    The audio is resampled to 16 kHz mono and reshaped to ``(channels,
    samples)`` — the layout FastRTC expects for playback.

    Fix over the original: the intermediate MP3 is deleted after loading,
    so repeated turns no longer leave ``temp_op.mp3`` behind.
    """
    tts_path = "temp_op.mp3"
    communicate = edge_tts.Communicate(text, "en-US-AndrewNeural")
    await communicate.save(tts_path)
    try:
        # librosa resamples to 16 kHz and returns mono float32 by default.
        audio, sr = librosa.load(tts_path, sr=16000)
    finally:
        # Best-effort cleanup of the synthesis artifact.
        try:
            os.remove(tts_path)
        except OSError:
            pass
    # FastRTC expects (channels, samples); librosa yields mono as 1-D.
    if audio.ndim == 1:
        audio = audio.reshape(1, -1)
    return sr, audio

def process_audio(audio: tuple[int, np.ndarray]):
    """Handle one user utterance: transcribe, reply via LLM, synthesize speech.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        ``(sample_rate, samples)`` from FastRTC; samples arrive as
        ``(channels, samples)`` (hence the transpose for soundfile).

    Returns
    -------
    tuple[int, np.ndarray]
        The spoken reply, as produced by ``text_to_speech_logic``.
    """
    sr, y = audio
    # Fix over the original: a unique temp path instead of a fixed
    # "input.wav" in the CWD, so concurrent sessions don't clobber each
    # other's recordings, and the file is removed afterwards.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    try:
        sf.write(wav_path, y.T, sr)  # soundfile wants (samples, channels)
        with open(wav_path, "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=("input.wav", file.read()),
                model="whisper-large-v3-turbo",
            )
    finally:
        try:
            os.remove(wav_path)
        except OSError:
            pass

    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
            {"role": "system", "content": "You are a concise voice assistant. 1-2 sentences only."},
            {"role": "user", "content": transcription.text}
        ]
    )
    reply_text = response.choices[0].message.content
    # ReplyOnPause runs this handler in a worker thread with no running
    # event loop, so asyncio.run is safe here — TODO confirm against the
    # fastrtc docs for this version.
    return asyncio.run(text_to_speech_logic(reply_text))

# Hand-assembled Gradio UI (no high-level wrapper).
with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ Voice Agent Live")
    gr.Markdown("Click the button below to start talking.")

    # Use the raw WebRTC component directly for maximum browser
    # compatibility; it acts as both microphone source and playback sink.
    voice_widget = WebRTC(
        label="Voice Chat",
        mode="send-receive",
        modality="audio",
        rtc_configuration=get_hf_turn_credentials(),
    )

    # Wire the pause-triggered handler: audio in, synthesized reply out,
    # through the same component.
    voice_widget.stream(
        fn=ReplyOnPause(process_audio),
        inputs=[voice_widget],
        outputs=[voice_widget],
    )

if __name__ == "__main__":
    demo.launch()

Files changed (1) hide show
  1. app.py +2 -49
app.py CHANGED
@@ -1,50 +1,3 @@
1
- import gradio as gr
2
- import os
3
- import numpy as np
4
- import librosa
5
- import asyncio
6
- import edge_tts
7
- import soundfile as sf
8
- from groq import Groq
9
- from fastrtc import Stream, ReplyOnPause, get_hf_turn_credentials
10
-
11
- # Initialize Groq
12
- client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
13
-
14
- async def text_to_speech_stream(text):
15
- communicate = edge_tts.Communicate(text, "en-US-AndrewNeural")
16
- await communicate.save("temp_op.mp3")
17
- audio, sr = librosa.load("temp_op.mp3", sr=16000)
18
- return sr, audio
19
-
20
- def process_audio(audio: tuple[int, np.ndarray]):
21
- sr, y = audio
22
- sf.write("input.wav", y, sr)
23
- with open("input.wav", "rb") as file:
24
- transcription = client.audio.transcriptions.create(
25
- file=("input.wav", file.read()),
26
- model="whisper-large-v3-turbo",
27
- )
28
-
29
- response = client.chat.completions.create(
30
- model="llama-3.3-70b-versatile",
31
- messages=[
32
- {"role": "system", "content": "You are a concise voice assistant. 1-2 sentences only."},
33
- {"role": "user", "content": transcription.text}
34
- ]
35
- )
36
- reply_text = response.choices[0].message.content
37
- return asyncio.run(text_to_speech_stream(reply_text))
38
-
39
- stream = Stream(
40
- handler=ReplyOnPause(process_audio),
41
- modality="audio",
42
- mode="send-receive",
43
- rtc_configuration=get_hf_turn_credentials()
44
- )
45
-
46
- with gr.Blocks() as demo:
47
- gr.Markdown("# 🎙️ Voice Agent Live")
48
- stream.ui.launch()
49
-
50
  demo.launch()
 
1
+ im
2
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  demo.launch()