# Hugging Face Spaces app — Groq voice-to-voice assistant
# (the original "Spaces: Sleeping" lines were page-status residue from the web scrape)
import os
import gradio as gr
from groq import Groq
from pathlib import Path

# Initialize Groq client using the secret stored in Hugging Face.
# NOTE(review): os.environ.get returns None when GROQ_API_KEY is unset, and
# Groq(api_key=None) only fails later at request time — confirm the secret
# is configured on the Space so errors surface clearly.
api_key = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=api_key)
def _transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* with Groq's Whisper model.

    Returns the transcribed text. temperature=0 keeps transcription
    deterministic.
    """
    # --- 1. Audio to Text (Transcription) ---
    with open(audio_path, "rb") as file:
        transcription = client.audio.transcriptions.create(
            file=(audio_path, file.read()),
            model="whisper-large-v3",
            temperature=0,
        )
    return transcription.text


def _generate_reply(user_text):
    """Generate a short assistant reply to *user_text* via Llama 3.1.

    The system prompt and max_tokens both cap the reply length so the
    downstream TTS call stays under the Groq TTS token limit.
    """
    # --- 2. Text Generation ---
    completion = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {
                "role": "system",
                "content": "You are a concise voice assistant. Your response MUST be under 50 words."
            },
            {"role": "user", "content": user_text}
        ],
        max_tokens=150,
        temperature=0.5,
    )
    return completion.choices[0].message.content


def _synthesize_speech(text):
    """Convert *text* to speech with Orpheus TTS and return the WAV path.

    The text is truncated to 1000 characters as a safety margin against
    the 1200-token TPM limit of the TTS endpoint (characters are only a
    rough proxy for tokens, so this is a conservative heuristic).
    """
    # --- 3. Text to Audio (Speech Synthesis) ---
    safe_audio_text = text[:1000]
    speech_file_path = "output_response.wav"
    response = client.audio.speech.create(
        model="canopylabs/orpheus-v1-english",
        voice="autumn",
        response_format="wav",
        input=safe_audio_text,
    )
    response.write_to_file(speech_file_path)
    return speech_file_path


def process_voice_assistant(audio_input):
    """Run the full voice-assistant pipeline on a recorded audio file.

    Parameters
    ----------
    audio_input : str | None
        Filepath to the user's recording (Gradio ``type="filepath"``),
        or None when nothing was recorded/uploaded.

    Returns
    -------
    tuple[str, str, str | None]
        (user transcription, assistant text reply, path to the synthesized
        WAV file). On error the first two slots carry status/error text
        and the audio slot is None — Gradio handlers must return values,
        not raise, so all exceptions are converted to messages here.
    """
    if audio_input is None:
        return "No audio provided.", "Please record or upload audio first.", None
    try:
        user_text = _transcribe_audio(audio_input)
        ai_response_text = _generate_reply(user_text)
        speech_file_path = _synthesize_speech(ai_response_text)
        return user_text, ai_response_text, speech_file_path
    except Exception as e:
        error_str = str(e)
        # A 413 from the API means the payload exceeded the voice engine's
        # size limit; surface a friendlier hint for that specific case.
        if "413" in error_str:
            return "Audio processed", "The AI response was too long for the voice engine. Try a shorter question.", None
        return "Error", error_str, None
# --- Gradio Interface ---
# Two-column layout: recording controls on the left, assistant output
# (transcript, reply text, synthesized voice) on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ Groq Voice-to-Voice Assistant")
    gr.Markdown("Deploying on Hugging Face Spaces using Whisper, Llama 3.1, and Orpheus.")

    with gr.Row():
        # Input side: microphone/upload widget plus the submit trigger.
        with gr.Column():
            recorded_audio = gr.Audio(
                label="Input Audio (Mic or Upload)",
                type="filepath",
                sources=["microphone", "upload"],
            )
            send_button = gr.Button("Submit", variant="primary")
        # Output side: text boxes for both transcripts and autoplaying audio.
        with gr.Column():
            transcript_box = gr.Textbox(label="Transcription")
            reply_box = gr.Textbox(label="AI Response")
            voice_output = gr.Audio(label="AI Voice Output", autoplay=True)

    send_button.click(
        fn=process_voice_assistant,
        inputs=[recorded_audio],
        outputs=[transcript_box, reply_box, voice_output],
    )

# For Hugging Face, we just call launch() without specific ports
if __name__ == "__main__":
    demo.launch()