"""Gradio app for the Nilla voice/text assistant.

Typed text is used directly; recorded audio is first transcribed with Groq
Whisper. Each turn is then sent to a Gemini Live session, which streams back
audio (for voice turns) plus an output transcription.
"""

import asyncio
import logging
import os
import tempfile
import wave

import gradio as gr
from google import genai
from google.genai import types
from groq import Groq

logger = logging.getLogger(__name__)

BOT_NAME = "Nilla"
MOTOR_NAME = "Nilla-2026 GPT motor"
PROVIDER = "HumanV lab"

# Runtime configuration pulled from the environment.
POR = os.environ.get("POR")                      # system instruction text
MODEL_ID = os.environ.get("MODEL_VERSION")       # Gemini live model id
UK_SERVER_API = os.environ.get("UK_SERVER_API")  # Gemini API key
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")    # Groq (Whisper) API key

client_gemini = genai.Client(
    api_key=UK_SERVER_API,
    http_options={"api_version": "v1alpha"},
)
client_groq = Groq(api_key=GROQ_API_KEY)


async def nilla_engine(audio_path, text_input, chat_history):
    """Run one conversation turn against the Gemini live session.

    Args:
        audio_path: filesystem path of a recorded clip, or None.
        text_input: typed user text; takes priority over audio when non-blank.
        chat_history: list of prior Gemini turn dicts, or None for a new chat.

    Returns:
        Tuple ``(audio_out_path, user_text, model_text, new_history)`` where
        ``audio_out_path`` is a temp .wav path for voice turns, else None.
    """
    if chat_history is None:
        chat_history = []

    user_text = ""
    is_voice = False

    if text_input and text_input.strip():
        user_text = text_input
        is_voice = False
    elif audio_path:
        # Transcribe the recording with Groq Whisper.
        with open(audio_path, "rb") as file:
            transcription = client_groq.audio.transcriptions.create(
                file=(audio_path, file.read()),
                model="whisper-large-v3",
                temperature=0,
                response_format="verbose_json",
            )
        user_text = transcription.text
        is_voice = True
    else:
        # Nothing to process: no text and no audio.
        return None, "", "", chat_history

    output_path = None
    model_response_text = ""
    current_turns = chat_history + [{"role": "user", "parts": [{"text": user_text}]}]

    config = {
        "response_modalities": ["AUDIO"],
        "system_instruction": POR,
        "enable_affective_dialog": True,
        "output_audio_transcription": {},
    }

    try:
        async with client_gemini.aio.live.connect(
            model=MODEL_ID, config=config
        ) as session:
            await session.send_client_content(turns=current_turns, turn_complete=True)

            if is_voice:
                # Reserve a .wav path, then close the handle immediately so
                # wave.open can reopen the path (the original leaked the fd
                # and would fail on Windows, where the open handle locks it).
                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
                output_path = temp_file.name
                temp_file.close()

                with wave.open(output_path, "wb") as wav:
                    wav.setnchannels(1)
                    wav.setsampwidth(2)      # 16-bit PCM
                    wav.setframerate(24000)  # assumes 24 kHz mono stream — TODO confirm against model docs
                    async for response in session.receive():
                        if response.data:
                            wav.writeframes(response.data)
                        if (
                            response.server_content
                            and response.server_content.output_transcription
                        ):
                            model_response_text += (
                                response.server_content.output_transcription.text
                            )
            else:
                # Text turn: collect only the transcription, drop audio chunks.
                async for response in session.receive():
                    if (
                        response.server_content
                        and response.server_content.output_transcription
                    ):
                        model_response_text += (
                            response.server_content.output_transcription.text
                        )

        new_history = current_turns + [
            {"role": "model", "parts": [{"text": model_response_text}]}
        ]
        return output_path, user_text, model_response_text, new_history
    except Exception:
        # Best-effort boundary: keep the UI alive, but log the failure
        # instead of swallowing it silently as the original did.
        logger.exception("Gemini live session failed")
        return None, user_text, "Error", chat_history


def run_interface(audio_file, text_input, chat_history):
    """Synchronous Gradio entry point wrapping the async engine.

    Uses asyncio.run, which creates AND closes a fresh event loop per call;
    the previous new_event_loop/run_until_complete pair leaked one loop per
    button click.
    """
    return asyncio.run(nilla_engine(audio_file, text_input, chat_history))


with gr.Blocks(title=BOT_NAME) as demo:
    # Hidden state carrying the Gemini turn history between clicks.
    history_component = gr.JSON(value=[], visible=False)
    with gr.Row():
        in_audio = gr.Audio(label="Audio", type="filepath")
        in_text = gr.Textbox(label="Text")
    with gr.Row():
        out_audio = gr.Audio(label="Voice Resp")
        out_user = gr.Textbox(label="User Text")
        out_nilla = gr.Textbox(label="Nilla Text")
    btn = gr.Button("Process")
    btn.click(
        fn=run_interface,
        inputs=[in_audio, in_text, history_component],
        outputs=[out_audio, out_user, out_nilla, history_component],
        api_name="run_interface",
    )

if __name__ == "__main__":
    demo.launch()