# app.py — Groq voice-to-voice assistant (Whisper STT → Llama 3.1 → Orpheus TTS)
# Designed to run on Hugging Face Spaces.
import os
import gradio as gr
from groq import Groq
from pathlib import Path
# Initialize Groq client using the secret stored in Hugging Face.
# NOTE(review): if GROQ_API_KEY is unset, api_key is None and the Groq
# client will only fail on the first API call, not here — consider
# failing fast with a clear message; confirm desired startup behavior.
api_key = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=api_key)
def _transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* with Groq Whisper.

    Returns the transcribed text.
    """
    with open(audio_path, "rb") as file:
        transcription = client.audio.transcriptions.create(
            file=(audio_path, file.read()),
            model="whisper-large-v3",
            temperature=0,  # deterministic transcription
        )
    return transcription.text


def _generate_reply(user_text):
    """Generate a short assistant reply to *user_text* with Llama 3.1.

    The system prompt caps the reply length so it stays under the Groq
    TTS token limit.
    """
    completion = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {
                "role": "system",
                "content": "You are a concise voice assistant. Your response MUST be under 50 words."
            },
            {"role": "user", "content": user_text}
        ],
        max_tokens=150,
        temperature=0.5,
    )
    return completion.choices[0].message.content


def _synthesize_speech(text, speech_file_path="output_response.wav"):
    """Convert *text* to a WAV file via Groq TTS; return the file path.

    Text is truncated to 1000 characters as a safety margin against the
    TTS engine's token limit.
    """
    safe_audio_text = text[:1000]
    response = client.audio.speech.create(
        model="canopylabs/orpheus-v1-english",
        voice="autumn",
        response_format="wav",
        input=safe_audio_text,
    )
    response.write_to_file(speech_file_path)
    return speech_file_path


def process_voice_assistant(audio_input):
    """Run the full voice pipeline: transcribe, reply, and speak.

    Parameters:
        audio_input: filesystem path to the recorded/uploaded audio,
            or None when nothing was provided.

    Returns:
        A 3-tuple (transcription, reply text, audio file path). On
        missing input or any API failure the third element is None and
        the text fields carry a user-facing message instead.
    """
    if audio_input is None:
        return "No audio provided.", "Please record or upload audio first.", None
    try:
        # --- 1. Audio to Text (Transcription) ---
        user_text = _transcribe_audio(audio_input)
        # --- 2. Text Generation ---
        ai_response_text = _generate_reply(user_text)
        # --- 3. Text to Audio (Speech Synthesis) ---
        speech_file_path = _synthesize_speech(ai_response_text)
        return user_text, ai_response_text, speech_file_path
    except Exception as e:
        # Top-level boundary for the Gradio callback: surface the error
        # in the UI rather than crashing the app.
        error_str = str(e)
        # HTTP 413 from the TTS endpoint means the reply exceeded its
        # payload limit; detection by substring is brittle but matches
        # the original behavior.
        if "413" in error_str:
            return "Audio processed", "The AI response was too long for the voice engine. Try a shorter question.", None
        return "Error", error_str, None
# --- Gradio Interface ---
# Two-column layout: audio input + submit on the left, transcript,
# reply text, and synthesized voice output on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ Groq Voice-to-Voice Assistant")
    gr.Markdown("Deploying on Hugging Face Spaces using Whisper, Llama 3.1, and Orpheus.")

    with gr.Row():
        with gr.Column():
            mic_or_file = gr.Audio(
                label="Input Audio (Mic or Upload)",
                type="filepath",
                sources=["microphone", "upload"],
            )
            run_button = gr.Button("Submit", variant="primary")
        with gr.Column():
            transcript_box = gr.Textbox(label="Transcription")
            reply_box = gr.Textbox(label="AI Response")
            voice_out = gr.Audio(label="AI Voice Output", autoplay=True)

    # Wire the button to the full voice pipeline.
    run_button.click(
        fn=process_voice_assistant,
        inputs=[mic_or_file],
        outputs=[transcript_box, reply_box, voice_out],
    )
# For Hugging Face, we just call launch() without specific ports
# (Spaces supplies the host/port configuration).
if __name__ == "__main__":
    demo.launch()