File size: 6,099 Bytes
e5b1065
8340e2c
 
01aa62d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8340e2c
e5b1065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8340e2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e5b1065
 
 
 
 
8340e2c
 
 
e5b1065
 
8340e2c
 
 
 
 
 
 
 
e5b1065
 
8340e2c
 
 
 
 
 
 
e5b1065
 
8340e2c
e5b1065
8340e2c
 
 
 
 
 
 
 
e5b1065
 
8340e2c
 
e5b1065
8340e2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import os, time
import gradio as gr
from transformers import pipeline
import torch

def make_pipe(task, model_id, fp16_ok=False):
    """Build a transformers pipeline, placing it on GPU 0 when available.

    Args:
        task: Pipeline task name, e.g. "summarization".
        model_id: Hugging Face Hub model identifier.
        fp16_ok: When True and CUDA is present, load the weights in float16.
            Ignored on CPU, where half precision is typically unsupported.

    Returns:
        A configured ``transformers.pipeline`` instance.
    """
    extra = {}
    if torch.cuda.is_available():
        extra["device"] = 0  # first CUDA device
        if fp16_ok:
            extra["model_kwargs"] = {"torch_dtype": torch.float16}
    # On CPU, pass no device argument at all and keep the default dtype.
    return pipeline(task, model=model_id, **extra)

# NOTE(review): removed the eager construction of five example pipelines
# (asr/zsc/summarizer/ocr/qa via make_pipe). Those module-level objects were
# never referenced -- agent(), do_ocr() and ask_qa() all go through the lazy
# get_*() loaders below -- so every model was downloaded and kept in memory
# twice, defeating the "loaded lazily per tab to reduce RAM" promise in the
# UI. make_pipe() itself is kept for callers that want explicit GPU/fp16
# placement.

# Lightweight default checkpoints. Each one can be overridden at deploy time
# through an environment variable (e.g. Hugging Face Space Secrets).
def _cfg(var, default):
    """Read a model id from the environment, falling back to *default*."""
    return os.getenv(var, default)

ASR_MODEL = _cfg("ASR_MODEL", "openai/whisper-tiny")
ZSC_MODEL = _cfg("ZSC_MODEL", "typeform/distilbert-base-uncased-mnli")
SUM_MODEL = _cfg("SUM_MODEL", "sshleifer/distilbart-cnn-12-6")
OCR_MODEL = _cfg("OCR_MODEL", "microsoft/trocr-small-printed")
QA_MODEL  = _cfg("QA_MODEL",  "distilbert-base-uncased-distilled-squad")

# Lazy singletons: each pipeline stays None until its tab first needs it.
_asr = _zsc = _summ = _ocr = _qa = None

def get_asr():
    """Return the cached speech-recognition pipeline, building it on first use."""
    global _asr
    if _asr is not None:
        return _asr
    _asr = pipeline("automatic-speech-recognition", model=ASR_MODEL)
    return _asr

def get_zsc():
    """Return the cached zero-shot classifier, building it on first use."""
    global _zsc
    if _zsc is not None:
        return _zsc
    _zsc = pipeline("zero-shot-classification", model=ZSC_MODEL)
    return _zsc

def get_summarizer():
    """Return the cached summarization pipeline, building it on first use."""
    global _summ
    if _summ is not None:
        return _summ
    _summ = pipeline("summarization", model=SUM_MODEL)
    return _summ

def get_ocr():
    """Return the cached image-to-text (OCR) pipeline, building it on first use."""
    global _ocr
    if _ocr is not None:
        return _ocr
    _ocr = pipeline("image-to-text", model=OCR_MODEL)
    return _ocr

def get_qa():
    """Return the cached extractive question-answering pipeline, building it on first use."""
    global _qa
    if _qa is not None:
        return _qa
    _qa = pipeline("question-answering", model=QA_MODEL)
    return _qa

# Intent labels offered to the zero-shot classifier when the user does not
# supply a custom comma-separated list in the UI.
DEFAULT_INTENTS = [
    "turn_on_lights", "turn_off_lights",
    "volume_up", "volume_down",
    "start_music", "pause_music",
    "set_timer", "cancel_timer",
    "open_calendar", "create_note",
    "start_recording", "stop_recording",
]

def _say(message):
    """Wrap a fixed status string in a zero-argument callable."""
    return lambda: message

def _note(text=""):
    """Persist-a-note demo action; echoes the first 60 chars of the text."""
    return f"Note saved: '{text[:60]}'"

# Intent name -> demo action. Every entry is a zero-arg callable except
# "create_note", which optionally receives the transcript (see agent()).
TOOLS = {
    "turn_on_lights": _say("Lights → ON"),
    "turn_off_lights": _say("Lights → OFF"),
    "volume_up": _say("Volume → UP"),
    "volume_down": _say("Volume → DOWN"),
    "start_music": _say("Music → PLAY"),
    "pause_music": _say("Music → PAUSE"),
    "set_timer": _say("Timer → 5 min (demo)"),
    "cancel_timer": _say("Timer → CANCELLED"),
    "open_calendar": _say("Calendar → OPENED"),
    "create_note": _note,
    "start_recording": _say("Recording → STARTED"),
    "stop_recording": _say("Recording → STOPPED"),
}

def parse_intents(custom):
    """Split a comma-separated label string; fall back to DEFAULT_INTENTS.

    Blank entries and surrounding whitespace are discarded. An empty or
    whitespace-only input yields the default intent list.
    """
    if custom and custom.strip():
        stripped = (label.strip() for label in custom.split(","))
        return [label for label in stripped if label]
    return DEFAULT_INTENTS

def agent(audio_path, custom_intents, history):
    """Transcribe audio, classify its intent, run the bound tool, log the step.

    Returns a 4-tuple matching the Gradio outputs [topk, chosen, result, chat]:
    top-3 label->score dict, chosen intent, tool result string, updated history.
    On missing audio or empty transcript, the first two outputs are no-op
    gr.update() placeholders and history is returned unchanged.
    """
    if not audio_path:
        return gr.update(), gr.update(), "No audio.", history

    # Load both pipelines up front (order matters for first-use latency).
    asr_pipe = get_asr()
    zsc_pipe = get_zsc()

    transcript = asr_pipe(audio_path)["text"].strip()
    if not transcript:
        return gr.update(), gr.update(), "No speech detected.", history

    candidates = parse_intents(custom_intents)
    ranked = zsc_pipe(transcript, candidate_labels=candidates, multi_label=False)
    labels = ranked["labels"]
    scores = ranked["scores"]
    top3 = {label: float(score) for label, score in list(zip(labels, scores))[:3]}

    chosen = labels[0]
    if chosen == "create_note":
        # The note tool is the only one that consumes the transcript.
        result = TOOLS[chosen](transcript)
    else:
        result = TOOLS.get(chosen, lambda: f"No tool bound: {chosen}")()

    stamp = time.strftime("%H:%M:%S")
    entry = [f"User: {transcript}", f"{stamp} • {chosen} → {result}"]
    return top3, chosen, result, history + [entry]

def do_ocr(image):
    """OCR an image and summarize the extracted text.

    Returns (ocr_text, summary). Both are empty strings when no image is
    given or the OCR output contains no visible characters.
    """
    if image is None:
        return "", ""

    # Load both pipelines up front (order matters for first-use latency).
    ocr_pipe = get_ocr()
    summ_pipe = get_summarizer()

    text = ocr_pipe(image)[0]["generated_text"]
    if not text.strip():
        return "", ""

    # Cap the summarizer input so oversized pages don't blow the model's
    # context window; only the first 3000 characters are summarized.
    summary = summ_pipe(text[:3000], max_length=120, min_length=30, do_sample=False)[0]["summary_text"]
    return text, summary

def ask_qa(context_text, question):
    """Answer *question* from *context_text* via the extractive QA model.

    Returns the model's answer span, or "" when either input is empty.
    """
    if context_text and question:
        prediction = get_qa()({"context": context_text, "question": question})
        return prediction.get("answer", "")
    return ""

with gr.Blocks(title="Multimodal Voice & OCR Agent") as demo:
    gr.Markdown("## 🎀🧾 Multimodal Voice & OCR Agent\nUses **pre-trained models** only. Models are loaded lazily per tab to reduce RAM.")

    with gr.Tabs():
        with gr.Tab("Voice Agent"):
            with gr.Row():
                audio = gr.Audio(sources=["microphone","upload"], type="filepath", label="Audio")
                intents_box = gr.Textbox(label="Intents (comma-separated)", value=", ".join(DEFAULT_INTENTS))
            run = gr.Button("Run")
            topk = gr.Label(num_top_classes=3, label="Top-k Intents")
            chosen = gr.Textbox(label="Chosen Intent")
            result = gr.Textbox(label="Action Result")
            chat = gr.Chatbot(label="Execution Log")
            state = gr.State([])
            run.click(agent, inputs=[audio, intents_box, state], outputs=[topk, chosen, result, chat], queue=True)

        with gr.Tab("OCR + Summarize + QA"):
            img = gr.Image(type="filepath", label="Upload an image / screenshot / page")
            ocr_btn = gr.Button("Extract text + Summarize")
            ocr_text = gr.Textbox(label="OCR Text", lines=10)
            ocr_sum  = gr.Textbox(label="Summary", lines=6)
            with gr.Row():
                question = gr.Textbox(label="Ask a question about the OCR text")
                qa_btn   = gr.Button("Answer")
                answer   = gr.Textbox(label="Answer")
            ocr_btn.click(do_ocr, inputs=img, outputs=[ocr_text, ocr_sum])
            qa_btn.click(ask_qa, inputs=[ocr_text, question], outputs=answer)