# Hugging Face Space demo — Voice Agent (ASR → Intent → Tools)
import time

import gradio as gr
from transformers import pipeline

# Model choices. Whisper-tiny keeps CPU Spaces responsive; switch to
# small/medium if you move to ZeroGPU hardware.
ASR_MODEL = "openai/whisper-tiny"
# For multilingual use: "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
ZSC_MODEL = "facebook/bart-large-mnli"

# Shared pipelines, built once at import time (first run downloads weights).
asr = pipeline("automatic-speech-recognition", model=ASR_MODEL)
zsc = pipeline("zero-shot-classification", model=ZSC_MODEL)
# Default intent labels offered to the zero-shot classifier. The UI lets the
# user override these with a comma-separated list (see parse_intents).
DEFAULT_INTENTS = [
    "turn_on_lights", "turn_off_lights", "volume_up", "volume_down",
    "start_music", "pause_music", "set_timer", "cancel_timer",
    "open_calendar", "create_note", "start_recording", "stop_recording",
]
# --- Demo tool implementations -------------------------------------------
# Each tool is a no-argument stub that returns a human-readable status
# string; create_note is the one exception — it takes the transcript text.

def tool_turn_on_lights():
    """Demo action: switch the lights on."""
    return "Lights → ON"

def tool_turn_off_lights():
    """Demo action: switch the lights off."""
    return "Lights → OFF"

def tool_volume_up():
    """Demo action: raise the volume."""
    return "Volume → UP"

def tool_volume_down():
    """Demo action: lower the volume."""
    return "Volume → DOWN"

def tool_start_music():
    """Demo action: start music playback."""
    return "Music → PLAY"

def tool_pause_music():
    """Demo action: pause music playback."""
    return "Music → PAUSE"

def tool_set_timer():
    """Demo action: start a fixed 5-minute timer."""
    return "Timer → 5 min (demo)"

def tool_cancel_timer():
    """Demo action: cancel the running timer."""
    return "Timer → CANCELLED"

def tool_open_calendar():
    """Demo action: open the calendar."""
    return "Calendar → OPENED"

def tool_create_note(text):
    """Save *text* as a note; the echo is truncated to 60 characters."""
    return f"Note saved: '{text[:60]}'"

def tool_start_recording():
    """Demo action: start recording."""
    return "Recording → STARTED"

def tool_stop_recording():
    """Demo action: stop recording."""
    return "Recording → STOPPED"

# Intent label → tool callable. Keys must match the intent labels the
# classifier is given (see DEFAULT_INTENTS).
TOOLS = {
    "turn_on_lights": tool_turn_on_lights,
    "turn_off_lights": tool_turn_off_lights,
    "volume_up": tool_volume_up,
    "volume_down": tool_volume_down,
    "start_music": tool_start_music,
    "pause_music": tool_pause_music,
    "set_timer": tool_set_timer,
    "cancel_timer": tool_cancel_timer,
    "open_calendar": tool_open_calendar,
    "create_note": tool_create_note,
    "start_recording": tool_start_recording,
    "stop_recording": tool_stop_recording,
}
def parse_intents(custom):
    """Parse a comma-separated intent string into a list of labels.

    Empty, whitespace-only, or None input falls back to DEFAULT_INTENTS.
    Surrounding whitespace is stripped from each label and empty entries
    (e.g. from trailing commas) are dropped.
    """
    if not custom or not custom.strip():
        return DEFAULT_INTENTS
    return [label.strip() for label in custom.split(",") if label.strip()]
def agent(audio_path, custom_intents, history):
    """Run the full pipeline: audio → transcript → intent → tool execution.

    Returns a 4-tuple for the UI: (top-3 label→score dict, chosen intent,
    action result string, updated history list). When there is no audio or
    no detected speech, the first two outputs are left untouched via
    gr.update() and only the status message / history are returned.
    """
    if not audio_path:
        return gr.update(), gr.update(), "No audio.", history
    transcript = asr(audio_path)["text"].strip()
    if not transcript:
        return gr.update(), gr.update(), "No speech detected.", history
    intents = parse_intents(custom_intents)
    out = zsc(transcript, candidate_labels=intents, multi_label=False)
    labels, scores = out["labels"], out["scores"]
    # Top-3 mapping for the gr.Label component (labels are sorted by score).
    top3 = {label: float(score) for label, score in zip(labels[:3], scores[:3])}
    chosen = labels[0]
    if chosen == "create_note":
        # create_note is the only tool that consumes the transcript text.
        result = TOOLS[chosen](transcript)
    else:
        # User-supplied intents may have no bound tool; fall back to a stub.
        result = TOOLS.get(chosen, lambda: f"No tool bound: {chosen}")()
    # NOTE(review): a time.strftime timestamp was computed here but never
    # used; removed as dead code.
    history = history + [[f"User: {transcript}", f"Agent: {chosen} → {result}"]]
    return top3, chosen, result, history
with gr.Blocks(title="Voice Agent: ASR → Intent → Tools") as demo:
    gr.Markdown("# 🎙️ Voice Agent\nSpeak or upload audio → transcript via Whisper → zero-shot intent → tool execution.")
    with gr.Row():
        audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio")
        intents_box = gr.Textbox(label="Intents (comma-separated)", value=", ".join(DEFAULT_INTENTS))
    run = gr.Button("Run")
    topk = gr.Label(num_top_classes=3, label="Top-k Intents")
    chosen = gr.Textbox(label="Chosen Intent")
    result = gr.Textbox(label="Action Result")
    chat = gr.Chatbot(label="Execution Log")
    # BUGFIX: history was fed from a gr.State that was never written back,
    # so the log reset on every click. The Chatbot's own value has the same
    # list-of-[user, agent] shape agent() appends to, so use it as the
    # history carrier — the log now accumulates across runs.
    run.click(
        agent,
        inputs=[audio, intents_box, chat],
        outputs=[topk, chosen, result, chat],
        queue=True,
    )

if __name__ == "__main__":
    demo.launch()