File size: 3,853 Bytes
1acda03
 
 
ed85784
db61141
 
1acda03
 
656e44c
1acda03
 
 
f7ee472
1acda03
 
db61141
 
606324d
 
 
 
db61141
656e44c
20c1db9
606324d
 
1acda03
20c1db9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1acda03
 
606324d
 
 
1acda03
606324d
1acda03
db61141
fca14be
1acda03
20c1db9
 
 
 
 
 
 
 
 
 
 
 
 
606324d
 
 
 
 
 
db61141
fca14be
1acda03
20c1db9
1acda03
 
20c1db9
1acda03
 
fca14be
db61141
20c1db9
 
db61141
20c1db9
 
 
fca14be
20c1db9
 
 
 
 
 
db61141
1acda03
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os
import asyncio
import wave
import gradio as gr
import tempfile
from groq import Groq
from google import genai
from google.genai import types

# Display / branding constants for the app.
BOT_NAME = "Nilla"
MOTOR_NAME = "Nilla-2026 GPT motor"
PROVIDER = "HumanV lab"
# POR is passed to Gemini as the system_instruction (see nilla_engine's config);
# presumably the bot's persona prompt — TODO confirm with deployment config.
POR = os.environ.get("POR")
# Gemini Live model identifier (e.g. a "models/..." id) — taken from the env.
MODEL_ID = os.environ.get("MODEL_VERSION")
# API keys: UK_SERVER_API is used as the Gemini key below; GROQ_API_KEY for Whisper.
UK_SERVER_API = os.environ.get("UK_SERVER_API")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Gemini client pinned to the v1alpha API surface (required for the Live API
# features used below: affective dialog, output audio transcription).
client_gemini = genai.Client(
    api_key=UK_SERVER_API,
    http_options={"api_version": "v1alpha"}
)
# Groq client used only for Whisper speech-to-text.
client_groq = Groq(api_key=GROQ_API_KEY)

async def nilla_engine(audio_path, text_input, chat_history):
    """Run one chat turn against Gemini Live, with optional Whisper STT.

    Input priority: non-blank ``text_input`` wins; otherwise ``audio_path``
    is transcribed with Groq Whisper; with neither, the turn is a no-op.
    Voice turns also stream Gemini's audio reply into a temporary WAV file.

    Args:
        audio_path: filepath of a recorded user clip, or None.
        text_input: typed user message, or None/blank.
        chat_history: list of Gemini content turns, or None for a new chat.

    Returns:
        (wav_path_or_None, user_text, model_text, new_history).
        On any engine failure: (None, user_text, "Error", chat_history).
    """
    if chat_history is None:
        chat_history = []

    user_text = ""
    is_voice = False

    if text_input and text_input.strip():
        user_text = text_input
    elif audio_path:
        # The Groq SDK call is blocking; run it in a worker thread so the
        # asyncio event loop is not stalled for the whole transcription.
        def _transcribe():
            with open(audio_path, "rb") as file:
                return client_groq.audio.transcriptions.create(
                    file=(audio_path, file.read()),
                    model="whisper-large-v3",
                    temperature=0,
                    response_format="verbose_json",
                )

        transcription = await asyncio.to_thread(_transcribe)
        user_text = transcription.text
        is_voice = True
    else:
        # Nothing to process: keep history unchanged.
        return None, "", "", chat_history

    output_path = None
    model_response_text = ""
    current_turns = chat_history + [{"role": "user", "parts": [{"text": user_text}]}]

    config = {
        "response_modalities": ["AUDIO"],
        "system_instruction": POR,
        "enable_affective_dialog": True,
        "output_audio_transcription": {}
    }

    try:
        async with client_gemini.aio.live.connect(model=MODEL_ID, config=config) as session:
            await session.send_client_content(turns=current_turns, turn_complete=True)

            if is_voice:
                # FIX: close the NamedTemporaryFile handle before wave.open
                # reopens the path — the original leaked the descriptor (and a
                # second open of an open handle fails on Windows).
                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
                output_path = temp_file.name
                temp_file.close()
                with wave.open(output_path, "wb") as wav:
                    wav.setnchannels(1)
                    wav.setsampwidth(2)
                    wav.setframerate(24000)  # Live API audio output: 24 kHz PCM16 mono
                    async for response in session.receive():
                        if response.data:
                            wav.writeframes(response.data)
                        if response.server_content and response.server_content.output_transcription:
                            model_response_text += response.server_content.output_transcription.text
            else:
                # Text turn: collect only the transcription of the model reply.
                async for response in session.receive():
                    if response.server_content and response.server_content.output_transcription:
                        model_response_text += response.server_content.output_transcription.text

        new_history = current_turns + [{"role": "model", "parts": [{"text": model_response_text}]}]
        return output_path, user_text, model_response_text, new_history
    except Exception:
        # Boundary handler: Gradio has no error channel here, so report via the
        # text output. FIX: also remove a partially written temp WAV so failed
        # voice turns do not leak files on disk.
        if output_path:
            try:
                os.remove(output_path)
            except OSError:
                pass
        return None, user_text, "Error", chat_history

def run_interface(audio_file, text_input, chat_history):
    """Synchronous Gradio entry point that drives the async engine.

    FIX: the original created a fresh event loop per click with
    asyncio.new_event_loop()/set_event_loop() and never closed it, leaking a
    loop (and its selector fd) on every request. asyncio.run() creates, runs
    and properly closes the loop, and raises if called from within a running
    loop instead of silently clobbering it.

    Returns the 4-tuple produced by nilla_engine:
    (wav_path_or_None, user_text, model_text, new_history).
    """
    return asyncio.run(nilla_engine(audio_file, text_input, chat_history))

# --- Gradio UI wiring (declarative; nesting order defines the layout) ---
with gr.Blocks(title=BOT_NAME) as demo:
    # Hidden JSON component carries the chat history between clicks
    # (Gradio round-trips it through the outputs list below).
    history_component = gr.JSON(value=[], visible=False)
    with gr.Row():
        # Inputs: a recorded clip (passed as a filepath) and/or typed text.
        in_audio = gr.Audio(label="Audio", type="filepath")
        in_text = gr.Textbox(label="Text")
    with gr.Row():
        # Outputs mirror nilla_engine's return tuple (minus history).
        out_audio = gr.Audio(label="Voice Resp")
        out_user = gr.Textbox(label="User Text")
        out_nilla = gr.Textbox(label="Nilla Text")
    
    btn = gr.Button("Process")
    # Output order must match run_interface's 4-tuple:
    # (audio path, user text, model text, updated history).
    btn.click(
        fn=run_interface,
        inputs=[in_audio, in_text, history_component],
        outputs=[out_audio, out_user, out_nilla, history_component],
        api_name="run_interface"
    )

if __name__ == "__main__":
    demo.launch()