# File size: 3,855 Bytes
# 0bb4017
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import tempfile
import io
import gradio as gr
from openai import OpenAI

# The API key is supplied through the OPENAI_API_KEY environment variable
# (configured as a secret on the hosting Space).
_api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=_api_key)

# System message placed at the start of every chat completion request.
SYSTEM_PROMPT = (
    "You are a friendly, concise voice assistant. "
    "Keep replies short when spoken (~2-3 sentences)."
)

def ensure_bytesio(obj):
    """Wrap raw binary data in an in-memory stream.

    ``bytes`` and ``bytearray`` values come back as a new ``io.BytesIO``
    holding the same data; any other object is returned unchanged.
    """
    is_raw_binary = isinstance(obj, (bytes, bytearray))
    return io.BytesIO(obj) if is_raw_binary else obj

def chat_fn(history, mic_audio, text_input, voice="alloy", model=None, temperature=0.6):
    """Run one voice-chat turn: transcribe, get a reply, synthesize speech.

    Args:
        history: Previous turns as a list of ``[user, assistant]`` pairs.
            ``None`` or an empty list means no prior conversation.
        mic_audio: Path of a recorded audio file, or a falsy value when
            nothing was recorded.
        text_input: Typed message; may be ``None`` or blank.
        voice: Voice name forwarded to the text-to-speech request.
        model: Chat model override. Blank or ``None`` falls back to the
            ``OPENAI_MODEL`` environment variable, then ``"gpt-4o-mini"``.
        temperature: Sampling temperature; coerced with ``float``.

    Returns:
        A tuple ``(history, audio_path, transcript)``.

        * When there is nothing to send (no typed text and no usable
          transcript), ``history`` is returned unchanged, ``audio_path`` is
          ``None`` and the third item is a prompt asking for input.
        * Otherwise ``history`` is a new list with ``[user_text, reply]``
          appended, ``audio_path`` is the path of a temporary ``.mp3`` with
          the spoken reply (``None`` if the model returned no text), and
          ``transcript`` is the transcription of ``mic_audio`` or ``""``.
    """
    user_text = (text_input or "").strip()

    # If the user provided audio, transcribe it and append it to typed text.
    transcript_text = None
    if mic_audio:
        # mic_audio is a file path (the audio component uses type='filepath').
        with open(mic_audio, "rb") as audio_file:
            tr = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                response_format="text",
            )
        raw_transcript = tr if isinstance(tr, str) else getattr(tr, "text", None)
        # Strip surrounding whitespace so the transcript is normalized the
        # same way whether or not typed text accompanies it.
        transcript_text = (raw_transcript or "").strip() or None
        if transcript_text:
            user_text = f"{user_text} {transcript_text}".strip()

    # Nothing to send: bail out before building any request.
    if not user_text:
        return history, None, "Please speak or type something."

    # Convert history (list of [user, assistant]) -> chat messages.
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for pair in history or []:
        if not pair:
            continue
        if pair[0]:
            messages.append({"role": "user", "content": pair[0]})
        if len(pair) > 1 and pair[1]:
            messages.append({"role": "assistant", "content": pair[1]})
    messages.append({"role": "user", "content": user_text})

    # A whitespace-only override counts as "no override".
    chosen_model = (model or "").strip() or os.getenv("OPENAI_MODEL", "gpt-4o-mini")

    comp = client.chat.completions.create(
        model=chosen_model,
        messages=messages,
        temperature=float(temperature),
    )
    # message.content can be None; treat that as an empty reply rather than
    # failing on None.strip().
    reply = (comp.choices[0].message.content or "").strip()

    tts_path = None
    if reply:
        # Text-to-speech is only requested when there is text to speak.
        speech = client.audio.speech.create(
            model="gpt-4o-mini-tts",
            voice=voice,
            input=reply,
        )
        # delete=False keeps the file on disk after the handle closes so the
        # caller can use the returned path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            tmp.write(speech.read())
            tts_path = tmp.name

    new_hist = (history or []) + [[user_text, reply]]
    return new_hist, tts_path, transcript_text or ""

# Gradio UI: microphone/text inputs, voice/model/temperature controls, a chat
# transcript, and an audio player for the spoken reply.
with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as demo:
    # Heading and tagline live in one string; the line break must be the
    # escape sequence "\n" because a raw newline inside a single-quoted
    # literal is a syntax error.
    gr.Markdown("# 🎙️ Voice Chat (Hugging Face Space)\nTalk to the AI and it talks back.")

    with gr.Row():
        chatbot = gr.Chatbot(height=340, type="messages")
    with gr.Row():
        audio_in = gr.Audio(sources=["microphone"], type="filepath", label="Mic (press to record)")
    with gr.Row():
        text_in = gr.Textbox(placeholder="...or type here and press Enter", scale=2)
        # NOTE(review): several of these names may not be voices accepted by
        # the text-to-speech endpoint — confirm against the provider's
        # current voice list before shipping.
        voice = gr.Dropdown(choices=["alloy","verse","amber","aria","bright","sage","sol","luna","coral","spark","horizon"], value="alloy", label="Voice", scale=1)
    with gr.Row():
        model = gr.Textbox(value="", placeholder="Model (leave blank for gpt-4o-mini)", label="Model override", scale=1)
        temp = gr.Slider(0.0, 1.5, value=0.6, step=0.1, label="Creativity")
    with gr.Row():
        audio_out = gr.Audio(label="AI Voice Reply", autoplay=True)
        transcript = gr.Textbox(label="Last transcription", interactive=False)

    # Conversation history kept as a list of [user, assistant] pairs.
    state = gr.State([])

    def _pairs_to_messages(pairs):
        """Convert [user, assistant] pairs into role/content dicts.

        The chatbot component is created with type="messages", so it is fed
        a flat list of {"role", "content"} dicts rather than pairs.
        """
        chat_messages = []
        for pair in pairs or []:
            if not pair:
                continue
            if pair[0]:
                chat_messages.append({"role": "user", "content": pair[0]})
            if len(pair) > 1 and pair[1]:
                chat_messages.append({"role": "assistant", "content": pair[1]})
        return chat_messages

    def _chat(state_hist, audio, text, voice_name, model_name, temperature):
        """Run one chat turn and fan the result out to the UI components.

        Returns, in order: updated history state, chatbot messages, path of
        the spoken reply, transcript text, and reset values for the
        microphone and text inputs.
        """
        new_hist, tts_path, transcript_text = chat_fn(
            state_hist, audio, text, voice_name, model_name, temperature
        )
        # Reset both inputs so a previous recording or typed line is not
        # submitted again on the next turn.
        return new_hist, _pairs_to_messages(new_hist), tts_path, transcript_text, None, ""

    go = gr.Button("Send / Speak")
    clear = gr.Button("Clear")

    chat_inputs = [state, audio_in, text_in, voice, model, temp]
    chat_outputs = [state, chatbot, audio_out, transcript, audio_in, text_in]

    go.click(_chat, inputs=chat_inputs, outputs=chat_outputs)
    text_in.submit(_chat, inputs=chat_inputs, outputs=chat_outputs)
    # Clearing wipes the stored history, the visible chat, the audio reply
    # and the transcript box.
    clear.click(fn=lambda: ([], [], None, ""), outputs=[state, chatbot, audio_out, transcript])

# Start the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    demo.launch()