"""Voice agent demo: speech → Whisper transcript → zero-shot intent → tool execution."""
import time

import gradio as gr
from transformers import pipeline

ASR_MODEL = "openai/whisper-tiny"  # small/medium if you switch to ZeroGPU
ZSC_MODEL = "facebook/bart-large-mnli"  # for multilingual use: "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"

# Loaded once at import time; both pipelines are reused across requests.
asr = pipeline("automatic-speech-recognition", model=ASR_MODEL)
zsc = pipeline("zero-shot-classification", model=ZSC_MODEL)

DEFAULT_INTENTS = [
    "turn_on_lights", "turn_off_lights", "volume_up", "volume_down",
    "start_music", "pause_music", "set_timer", "cancel_timer",
    "open_calendar", "create_note", "start_recording", "stop_recording",
]


# --- Demo "tools": each returns a human-readable result string. ------------

def tool_turn_on_lights():
    return "Lights → ON"


def tool_turn_off_lights():
    return "Lights → OFF"


def tool_volume_up():
    return "Volume → UP"


def tool_volume_down():
    return "Volume → DOWN"


def tool_start_music():
    return "Music → PLAY"


def tool_pause_music():
    return "Music → PAUSE"


def tool_set_timer():
    return "Timer → 5 min (demo)"


def tool_cancel_timer():
    return "Timer → CANCELLED"


def tool_open_calendar():
    return "Calendar → OPENED"


def tool_create_note(text):
    # Only this tool consumes the transcript; the preview is truncated to 60 chars.
    return f"Note saved: '{text[:60]}'"


def tool_start_recording():
    return "Recording → STARTED"


def tool_stop_recording():
    return "Recording → STOPPED"


# Intent label → callable. Dispatch table keeps `agent` free of if/elif chains.
TOOLS = {
    "turn_on_lights": tool_turn_on_lights,
    "turn_off_lights": tool_turn_off_lights,
    "volume_up": tool_volume_up,
    "volume_down": tool_volume_down,
    "start_music": tool_start_music,
    "pause_music": tool_pause_music,
    "set_timer": tool_set_timer,
    "cancel_timer": tool_cancel_timer,
    "open_calendar": tool_open_calendar,
    "create_note": tool_create_note,
    "start_recording": tool_start_recording,
    "stop_recording": tool_stop_recording,
}


def parse_intents(custom):
    """Parse a comma-separated intent string; fall back to DEFAULT_INTENTS when blank."""
    if not custom or not custom.strip():
        return DEFAULT_INTENTS
    return [t.strip() for t in custom.split(",") if t.strip()]


def agent(audio_path, custom_intents, history):
    """Transcribe `audio_path`, classify intent, execute the bound tool.

    Returns (top-3 scores dict, chosen intent, tool result, chat log, state).
    The chat log and state are the same list so the conversation persists
    across clicks.
    """
    if not audio_path:
        return gr.update(), gr.update(), "No audio.", history, history
    transcript = asr(audio_path)["text"].strip()
    if not transcript:
        return gr.update(), gr.update(), "No speech detected.", history, history

    intents = parse_intents(custom_intents)
    out = zsc(transcript, candidate_labels=intents, multi_label=False)
    labels = out["labels"]
    scores = out["scores"]
    # Labels come back sorted by score; keep the top three for the gr.Label widget.
    top3 = {labels[i]: float(scores[i]) for i in range(min(3, len(labels)))}
    chosen = labels[0]

    if chosen == "create_note":
        # create_note is the only tool that takes the transcript as an argument.
        result = TOOLS[chosen](transcript)
    else:
        result = TOOLS.get(chosen, lambda: f"No tool bound: {chosen}")()

    stamp = time.strftime("%H:%M:%S")  # NOTE(review): computed but unused — kept for parity; wire into the log if desired.
    history = history + [[f"User: {transcript}", f"Agent: {chosen} → {result}"]]
    # Return history twice: once for the Chatbot display, once to update gr.State.
    return top3, chosen, result, history, history


with gr.Blocks(title="Voice Agent: ASR → Intent → Tools") as demo:
    gr.Markdown("# 🎙️ Voice Agent\nSpeak or upload audio → transcript via Whisper → zero-shot intent → tool execution.")
    with gr.Row():
        audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio")
        intents_box = gr.Textbox(label="Intents (comma-separated)", value=", ".join(DEFAULT_INTENTS))
    run = gr.Button("Run")
    topk = gr.Label(num_top_classes=3, label="Top-k Intents")
    chosen = gr.Textbox(label="Chosen Intent")
    result = gr.Textbox(label="Action Result")
    chat = gr.Chatbot(label="Execution Log")
    state = gr.State([])
    # Bug fix: `state` must be an output too, otherwise the history input is
    # always the initial [] and the log resets on every click.
    run.click(
        agent,
        inputs=[audio, intents_box, state],
        outputs=[topk, chosen, result, chat, state],
        queue=True,
    )

if __name__ == "__main__":
    demo.launch()