"""Groq voice-to-voice assistant served through a Gradio UI.

Pipeline for each request: recorded/uploaded audio -> Whisper transcription
-> Llama chat completion -> Orpheus speech synthesis. All three steps go
through the Groq API client created at import time.
"""

import os
import tempfile
from pathlib import Path
from typing import Optional, Tuple

import gradio as gr
from groq import Groq

# Initialize Groq client using the secret stored in Hugging Face
api_key = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=api_key)

# Upper bound (in characters) on the text handed to the speech model.
# NOTE(review): the original author chose 1000 to stay under a "1200 token
# TPM limit"; confirm against the current Groq limits for the TTS model.
MAX_TTS_CHARS = 1000


def process_voice_assistant(
    audio_input: Optional[str],
) -> Tuple[str, str, Optional[str]]:
    """Transcribe the audio, generate a short reply, and synthesize it.

    Args:
        audio_input: Filesystem path of the recorded or uploaded audio
            (the Gradio component uses ``type="filepath"``), or ``None``
            when the user submitted nothing.

    Returns:
        A ``(transcription, ai_response, speech_path)`` tuple matching the
        three Gradio outputs. ``speech_path`` is ``None`` whenever no audio
        was produced (missing input, silent input, or an error); in the
        error case the first two items carry a status and the error text.
    """
    if audio_input is None:
        return "No audio provided.", "Please record or upload audio first.", None

    try:
        # --- 1. Audio to Text (Transcription) ---
        # Send only the basename: the extension is kept for the upload,
        # but the server-side temp directory is not leaked to the API.
        with open(audio_input, "rb") as audio_file:
            transcription = client.audio.transcriptions.create(
                file=(Path(audio_input).name, audio_file.read()),
                model="whisper-large-v3",
                temperature=0,
            )
        user_text = transcription.text

        # Silence or noise can transcribe to nothing; skip the LLM and TTS
        # calls instead of sending them an empty prompt.
        if not user_text or not user_text.strip():
            return user_text or "", "No speech was detected. Please try again.", None

        # --- 2. Text Generation ---
        # We include strict instructions to stay under the Groq TTS token limit
        completion = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[
                {
                    "role": "system",
                    "content": "You are a concise voice assistant. Your response MUST be under 50 words.",
                },
                {"role": "user", "content": user_text},
            ],
            max_tokens=150,
            temperature=0.5,
        )
        ai_response_text = completion.choices[0].message.content

        # --- 3. Text to Audio (Speech Synthesis) ---
        # Safety: truncate the text so the TTS request stays within limits.
        safe_audio_text = ai_response_text[:MAX_TTS_CHARS]

        response = client.audio.speech.create(
            model="canopylabs/orpheus-v1-english",
            voice="autumn",
            response_format="wav",
            input=safe_audio_text,
        )

        # Write to a unique temp file per request. A fixed name in the
        # working directory can be overwritten by another request and
        # fails if that directory is not writable. The file is left on
        # disk (delete=False) because Gradio reads it after we return.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            speech_file_path = tmp.name
        response.write_to_file(speech_file_path)

        return user_text, ai_response_text, speech_file_path

    except Exception as e:
        # UI boundary: surface the failure in the text boxes rather than
        # crashing the Gradio event handler.
        error_str = str(e)
        if "413" in error_str:
            return (
                "Audio processed",
                "The AI response was too long for the voice engine. Try a shorter question.",
                None,
            )
        return "Error", error_str, None


# --- Gradio Interface ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ Groq Voice-to-Voice Assistant")
    gr.Markdown("Deploying on Hugging Face Spaces using Whisper, Llama 3.1, and Orpheus.")

    with gr.Row():
        with gr.Column():
            audio_in = gr.Audio(
                label="Input Audio (Mic or Upload)",
                type="filepath",
                sources=["microphone", "upload"],
            )
            submit_btn = gr.Button("Submit", variant="primary")

        with gr.Column():
            user_transcript = gr.Textbox(label="Transcription")
            ai_transcript = gr.Textbox(label="AI Response")
            audio_out = gr.Audio(label="AI Voice Output", autoplay=True)

    submit_btn.click(
        fn=process_voice_assistant,
        inputs=[audio_in],
        outputs=[user_transcript, ai_transcript, audio_out],
    )

# For Hugging Face, we just call launch() without specific ports
if __name__ == "__main__":
    demo.launch()