# HumanV / app.py
# Source: Hugging Face Space "HumanV" — uploaded by humanvprojectceo
# (commit 20c1db9, "Update app.py", verified)
import os
import asyncio
import wave
import gradio as gr
import tempfile
from groq import Groq
from google import genai
from google.genai import types
# Branding / display constants for the assistant.
BOT_NAME = "Nilla"
MOTOR_NAME = "Nilla-2026 GPT motor"
PROVIDER = "HumanV lab"

# Runtime configuration read from the environment; each may be None if unset.
# POR is passed verbatim as the Gemini system_instruction (see nilla_engine).
POR = os.environ.get("POR")
# MODEL_VERSION selects the Gemini live model used by client_gemini.
MODEL_ID = os.environ.get("MODEL_VERSION")
# UK_SERVER_API is the Google GenAI API key — TODO confirm naming in deployment.
UK_SERVER_API = os.environ.get("UK_SERVER_API")
# GROQ_API_KEY authenticates the Whisper transcription calls.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Module-level API clients, constructed at import time.
# NOTE(review): no validation that the keys above are actually set — failures
# would presumably surface on first API call; confirm desired behavior.
client_gemini = genai.Client(
    api_key=UK_SERVER_API,
    http_options={"api_version": "v1alpha"}
)
client_groq = Groq(api_key=GROQ_API_KEY)
async def nilla_engine(audio_path, text_input, chat_history):
    """Run one conversation turn against the Gemini live API.

    Typed text takes priority over audio; audio is first transcribed with
    Groq Whisper.  Voice input receives a spoken (WAV) reply plus a text
    transcript; text input receives a transcript only.

    Args:
        audio_path: Filesystem path to recorded user audio, or None.
        text_input: Typed user message; used when non-blank.
        chat_history: Prior turns as Gemini content dicts
            ({"role": ..., "parts": [{"text": ...}]}), or None.

    Returns:
        Tuple (wav_path_or_None, user_text, model_text, new_history).
        On any live-session failure, returns
        (None, user_text, "Error", chat_history) so the UI keeps its
        previous state.
    """
    if chat_history is None:
        chat_history = []

    user_text = ""
    is_voice = False
    if text_input and text_input.strip():
        user_text = text_input
        is_voice = False
    elif audio_path:
        # Transcribe the uploaded audio via Groq Whisper.
        with open(audio_path, "rb") as file:
            transcription = client_groq.audio.transcriptions.create(
                file=(audio_path, file.read()),
                model="whisper-large-v3",
                temperature=0,
                response_format="verbose_json",
            )
        user_text = transcription.text
        is_voice = True
    else:
        # No input at all: empty outputs, history unchanged.
        return None, "", "", chat_history

    output_path = None
    model_response_text = ""
    current_turns = chat_history + [{"role": "user", "parts": [{"text": user_text}]}]
    config = {
        "response_modalities": ["AUDIO"],
        "system_instruction": POR,
        "enable_affective_dialog": True,
        "output_audio_transcription": {},
    }
    try:
        async with client_gemini.aio.live.connect(model=MODEL_ID, config=config) as session:
            await session.send_client_content(turns=current_turns, turn_complete=True)
            if is_voice:
                # Reserve a persistent temp .wav path, then close the handle
                # immediately: wave.open reopens the path itself, and the
                # original code leaked one file descriptor per voice turn.
                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
                output_path = temp_file.name
                temp_file.close()
                with wave.open(output_path, "wb") as wav:
                    # 16-bit mono PCM at 24 kHz (matches the audio the live
                    # session streams back).
                    wav.setnchannels(1)
                    wav.setsampwidth(2)
                    wav.setframerate(24000)
                    async for response in session.receive():
                        if response.data:
                            wav.writeframes(response.data)
                        if response.server_content and response.server_content.output_transcription:
                            model_response_text += response.server_content.output_transcription.text
            else:
                # Text-only turn: accumulate just the output transcription.
                async for response in session.receive():
                    if response.server_content and response.server_content.output_transcription:
                        model_response_text += response.server_content.output_transcription.text
        new_history = current_turns + [{"role": "model", "parts": [{"text": model_response_text}]}]
        return output_path, user_text, model_response_text, new_history
    except Exception:
        # Deliberate best-effort: any live-session failure degrades to an
        # "Error" transcript instead of crashing the UI.
        # NOTE(review): consider logging the exception for debuggability.
        return None, user_text, "Error", chat_history
def run_interface(audio_file, text_input, chat_history):
    """Synchronous Gradio entry point: drive nilla_engine to completion.

    Uses asyncio.run, which creates a fresh event loop for the call and —
    unlike the previous new_event_loop()/run_until_complete pattern —
    closes it afterwards, so no loop is leaked per request.

    Args:
        audio_file: Path to recorded audio from the Audio component, or None.
        text_input: Typed text from the Textbox component.
        chat_history: Hidden-state chat history (list of turn dicts) or None.

    Returns:
        The 4-tuple produced by nilla_engine:
        (audio_path_or_None, user_text, model_text, new_history).
    """
    return asyncio.run(nilla_engine(audio_file, text_input, chat_history))
# --- Gradio UI definition (module-level side effect: builds the app) ---
with gr.Blocks(title=BOT_NAME) as demo:
    # Hidden JSON component that persists the Gemini-format chat history
    # (list of {"role", "parts"} dicts) across button clicks.
    history_component = gr.JSON(value=[], visible=False)
    with gr.Row():
        in_audio = gr.Audio(label="Audio", type="filepath")
        in_text = gr.Textbox(label="Text")
    with gr.Row():
        out_audio = gr.Audio(label="Voice Resp")
        out_user = gr.Textbox(label="User Text")
        out_nilla = gr.Textbox(label="Nilla Text")
    btn = gr.Button("Process")
    # Wire the button: run_interface returns
    # (audio_path, user_text, model_text, new_history); note that
    # history_component appears as both input and output so history round-trips.
    btn.click(
        fn=run_interface,
        inputs=[in_audio, in_text, history_component],
        outputs=[out_audio, out_user, out_nilla, history_component],
        api_name="run_interface"
    )

if __name__ == "__main__":
    # Launch the Gradio server when run as a script.
    demo.launch()