File size: 3,644 Bytes
5590cf9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import gradio as gr
from transformers import pipeline
import time

# Model checkpoints: tiny Whisper keeps CPU latency tolerable for a demo.
ASR_MODEL = "openai/whisper-tiny"  # small/medium if you switch to ZeroGPU
ZSC_MODEL = "facebook/bart-large-mnli"  # for multilingual use: "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"

# Both pipelines are built once at import time; weights are downloaded on
# first run, so the initial startup can be slow.
asr = pipeline("automatic-speech-recognition", model=ASR_MODEL)
zsc = pipeline("zero-shot-classification", model=ZSC_MODEL)

# Fallback intent labels used when the UI textbox is left blank.  Each label
# is expected to have a matching handler in the TOOLS dispatch table below.
DEFAULT_INTENTS = [
    "turn_on_lights","turn_off_lights","volume_up","volume_down",
    "start_music","pause_music","set_timer","cancel_timer",
    "open_calendar","create_note","start_recording","stop_recording"
]

def tool_turn_on_lights():
    """Demo tool: pretend to switch the lights on; returns a status string."""
    return "Lights → ON"

def tool_turn_off_lights():
    """Demo tool: pretend to switch the lights off; returns a status string."""
    return "Lights → OFF"

def tool_volume_up():
    """Demo tool: pretend to raise the volume; returns a status string."""
    return "Volume → UP"

def tool_volume_down():
    """Demo tool: pretend to lower the volume; returns a status string."""
    return "Volume → DOWN"

def tool_start_music():
    """Demo tool: pretend to start music playback; returns a status string."""
    return "Music → PLAY"

def tool_pause_music():
    """Demo tool: pretend to pause music playback; returns a status string."""
    return "Music → PAUSE"

def tool_set_timer():
    """Demo tool: pretend to start a fixed 5-minute timer; returns a status string."""
    return "Timer → 5 min (demo)"

def tool_cancel_timer():
    """Demo tool: pretend to cancel the running timer; returns a status string."""
    return "Timer → CANCELLED"

def tool_open_calendar():
    """Demo tool: pretend to open the calendar; returns a status string."""
    return "Calendar → OPENED"

def tool_create_note(text):
    """Demo tool: acknowledge a saved note, echoing at most 60 chars of *text*."""
    preview = text[:60]
    return f"Note saved: '{preview}'"

def tool_start_recording():
    """Demo tool: pretend to start a recording; returns a status string."""
    return "Recording → STARTED"

def tool_stop_recording():
    """Demo tool: pretend to stop the recording; returns a status string."""
    return "Recording → STOPPED"

# Dispatch table mapping each intent label to its handler.  All handlers take
# zero arguments except create_note, which receives the raw transcript — see
# the special case in agent().  Keys must match the labels produced by
# parse_intents()/DEFAULT_INTENTS for a tool to be executable.
TOOLS = {
    "turn_on_lights": tool_turn_on_lights,
    "turn_off_lights": tool_turn_off_lights,
    "volume_up": tool_volume_up,
    "volume_down": tool_volume_down,
    "start_music": tool_start_music,
    "pause_music": tool_pause_music,
    "set_timer": tool_set_timer,
    "cancel_timer": tool_cancel_timer,
    "open_calendar": tool_open_calendar,
    "create_note": tool_create_note,
    "start_recording": tool_start_recording,
    "stop_recording": tool_stop_recording,
}

def parse_intents(custom):
    """Split a comma-separated label string into a clean list of intents.

    Blank/whitespace-only (or None) input falls back to DEFAULT_INTENTS.
    Empty segments produced by stray commas are dropped.
    """
    if custom and custom.strip():
        return [label.strip() for label in custom.split(",") if label.strip()]
    return DEFAULT_INTENTS

def agent(audio_path, custom_intents, history):
    """Run one voice-agent turn: transcribe audio, classify intent, run a tool.

    Args:
        audio_path: Filesystem path to the recorded/uploaded audio, or None.
        custom_intents: Comma-separated intent labels from the UI textbox.
        history: Accumulated execution log as a list of [user, agent] pairs.

    Returns:
        (top-3 intent scores, chosen intent, tool result, chat log, state).
        The log is returned twice so the Chatbot display and the gr.State
        session store stay in sync (BUG FIX: state was previously never
        written back, so the log never persisted across turns).
    """
    if not audio_path:
        return gr.update(), gr.update(), "No audio.", history, history
    transcript = asr(audio_path)["text"].strip()
    if not transcript:
        return gr.update(), gr.update(), "No speech detected.", history, history

    intents = parse_intents(custom_intents)
    out = zsc(transcript, candidate_labels=intents, multi_label=False)
    labels = out["labels"]
    scores = out["scores"]
    # The pipeline returns labels sorted by score, so the first entries are
    # already the best candidates.
    top3 = {labels[i]: float(scores[i]) for i in range(min(3, len(labels)))}

    chosen = labels[0]
    # create_note is the only tool that consumes the transcript itself.
    if chosen == "create_note":
        result = TOOLS[chosen](transcript)
    else:
        result = TOOLS.get(chosen, lambda: f"No tool bound: {chosen}")()

    # Timestamp each turn so the execution log reads as a session trace
    # (BUG FIX: stamp was previously computed but never used, and intent and
    # result were concatenated with no separator).
    stamp = time.strftime("%H:%M:%S")
    history = history + [[f"User: {transcript}", f"Agent [{stamp}]: {chosen} → {result}"]]
    return top3, chosen, result, history, history

with gr.Blocks(title="Voice Agent: ASR → Intent → Tools") as demo:
    gr.Markdown("# 🎙️ Voice Agent\nSpeak or upload audio → transcript via Whisper → zero-shot intent → tool execution.")
    with gr.Row():
        audio = gr.Audio(sources=["microphone","upload"], type="filepath", label="Audio")
        intents_box = gr.Textbox(label="Intents (comma-separated)", value=", ".join(DEFAULT_INTENTS))
    run = gr.Button("Run")
    topk = gr.Label(num_top_classes=3, label="Top-k Intents")
    chosen = gr.Textbox(label="Chosen Intent")
    result = gr.Textbox(label="Action Result")
    chat = gr.Chatbot(label="Execution Log")
    state = gr.State([])

    # state must appear in outputs as well as inputs: gr.State is only
    # updated when the handler returns a value for it.  Without this the
    # log reset to [] on every click and the Chatbot showed a single entry.
    run.click(
        agent,
        inputs=[audio, intents_box, state],
        outputs=[topk, chosen, result, chat, state],
        queue=True,
    )

if __name__ == "__main__":
    demo.launch()