File size: 3,181 Bytes
48f585b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os
import tempfile
from pathlib import Path

import gradio as gr
from groq import Groq

# Build the shared Groq client. The key is read from the GROQ_API_KEY
# environment variable (the secret stored in Hugging Face).
api_key = os.getenv("GROQ_API_KEY")
client = Groq(api_key=api_key)

def process_voice_assistant(audio_input):
    """Run one voice-assistant turn: transcribe, generate a reply, synthesize speech.

    Args:
        audio_input: Filesystem path of the recorded or uploaded audio clip,
            or ``None`` when the user submitted nothing.

    Returns:
        A 3-tuple ``(transcription, ai_response, audio_path)``. ``audio_path``
        is the path of a freshly written WAV file, or ``None`` when no speech
        was produced (missing input, empty transcript/reply, or an error).
        On failure the first element is ``"Error"`` and the second carries
        the exception text, except for a 413 raised by speech synthesis,
        which is reported with the real transcript and a friendly message.
    """
    if audio_input is None:
        return "No audio provided.", "Please record or upload audio first.", None

    # Tracked outside the ``try`` so the error handler knows how far we got.
    user_text = None
    ai_response_text = None

    try:
        # --- 1. Audio to Text (Transcription) ---
        with open(audio_input, "rb") as audio_file:
            audio_bytes = audio_file.read()

        transcription = client.audio.transcriptions.create(
            # Send only the base name (extension included); the remote API
            # has no use for the server-side directory layout.
            file=(Path(audio_input).name, audio_bytes),
            model="whisper-large-v3",
            temperature=0,
        )
        user_text = transcription.text or ""
        if not user_text.strip():
            # Nothing intelligible was said; skip the LLM and TTS calls.
            return user_text, "No speech detected. Please try again.", None

        # --- 2. Text Generation ---
        # We include strict instructions to stay under the Groq TTS token limit
        completion = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[
                {
                    "role": "system",
                    "content": "You are a concise voice assistant. Your response MUST be under 50 words."
                },
                {"role": "user", "content": user_text},
            ],
            max_tokens=150,
            temperature=0.5,
        )
        # ``content`` may be None; normalise so slicing below cannot fail.
        ai_response_text = completion.choices[0].message.content or ""

        # --- 3. Text to Audio (Speech Synthesis) ---
        # Safety: Truncate text to ensure it doesn't exceed the 1200 token TPM limit
        safe_audio_text = ai_response_text[:1000]
        if not safe_audio_text.strip():
            # Nothing to speak; return the text results without audio.
            return user_text, ai_response_text, None

        response = client.audio.speech.create(
            model="canopylabs/orpheus-v1-english",
            voice="autumn",
            response_format="wav",
            input=safe_audio_text,
        )

        # Use a unique file per request: a fixed name would let concurrent
        # requests overwrite each other's audio. The file is created only
        # after synthesis succeeds and is left on disk for the caller to serve.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            speech_file_path = tmp.name
        response.write_to_file(speech_file_path)

        return user_text, ai_response_text, speech_file_path

    except Exception as e:
        error_str = str(e)
        # Check the HTTP status carried by the exception instead of searching
        # the message text: paths and IDs inside unrelated error messages can
        # contain the digits "413" and would be misreported.
        too_large = getattr(e, "status_code", None) == 413
        if too_large and ai_response_text is not None:
            # A reply exists, so the failing call was speech synthesis.
            return user_text, "The AI response was too long for the voice engine. Try a shorter question.", None
        return "Error", error_str, None

# --- Gradio Interface ---
# Two columns: audio input and submit button on the left, the transcript,
# the AI reply text and the synthesized audio on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(value="# 🎙️ Groq Voice-to-Voice Assistant")
    gr.Markdown(value="Deploying on Hugging Face Spaces using Whisper, Llama 3.1, and Orpheus.")

    with gr.Row():
        with gr.Column():
            # type="filepath" hands the handler a path string, not raw samples.
            audio_in = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Input Audio (Mic or Upload)",
            )
            submit_btn = gr.Button(value="Submit", variant="primary")

        with gr.Column():
            user_transcript = gr.Textbox(label="Transcription")
            ai_transcript = gr.Textbox(label="AI Response")
            audio_out = gr.Audio(autoplay=True, label="AI Voice Output")

    # The handler's 3-tuple maps, in order, onto the three output components.
    submit_btn.click(
        process_voice_assistant,
        [audio_in],
        [user_transcript, ai_transcript, audio_out],
    )

# For Hugging Face, launch() is called without specific ports.
if __name__ == "__main__":
    demo.launch()