# Voice_OCR_Agent / app.py
# Provenance: hudaakram's Hugging Face Space, commit 01aa62d ("Update app.py").
# (The original first lines were web-page residue, not valid Python.)
import os, time
import gradio as gr
from transformers import pipeline
import torch
def make_pipe(task, model_id, fp16_ok=False):
    """Create a transformers pipeline for *task* backed by *model_id*.

    When CUDA is available the model is placed on GPU 0, optionally in
    float16 (``fp16_ok=True``); on CPU no device/dtype options are passed.
    """
    extra = {}
    if torch.cuda.is_available():
        extra["device"] = 0  # GPU present: pin the pipeline to device 0
        if fp16_ok:
            extra["model_kwargs"] = {"torch_dtype": torch.float16}
    return pipeline(task, model=model_id, **extra)
# NOTE(review): five pipelines (ASR, zero-shot, summarization, OCR, QA) were
# previously instantiated eagerly here with hard-coded model ids.  That
# duplicated the lazy get_*() accessors defined below, ignored the *_MODEL
# environment overrides, and loaded every heavy model at import time —
# defeating the "models are loaded lazily per tab" design.  Nothing in this
# file used those module-level objects (all call sites go through the
# getters), so the eager block was removed.
# Lighter defaults — each id can be overridden via Space Secrets / env vars.
_env = os.environ.get
ASR_MODEL = _env("ASR_MODEL", "openai/whisper-tiny")
ZSC_MODEL = _env("ZSC_MODEL", "typeform/distilbert-base-uncased-mnli")
SUM_MODEL = _env("SUM_MODEL", "sshleifer/distilbart-cnn-12-6")
OCR_MODEL = _env("OCR_MODEL", "microsoft/trocr-small-printed")
QA_MODEL = _env("QA_MODEL", "distilbert-base-uncased-distilled-squad")

# Lazily-built pipeline cache: each slot stays None until its getter runs.
_asr = None
_zsc = None
_summ = None
_ocr = None
_qa = None
def get_asr():
    """Lazily build and cache the speech-recognition pipeline.

    Routes through make_pipe() so the model lands on the GPU (fp16) when
    CUDA is available — the previous direct pipeline() call always ran on
    CPU regardless of hardware.
    """
    global _asr
    if _asr is None:
        _asr = make_pipe("automatic-speech-recognition", ASR_MODEL, fp16_ok=True)
    return _asr
def get_zsc():
    """Lazily build and cache the zero-shot intent classifier.

    Uses make_pipe() for GPU placement when available (fp16 left off,
    matching the module's example usage for this model).
    """
    global _zsc
    if _zsc is None:
        _zsc = make_pipe("zero-shot-classification", ZSC_MODEL)
    return _zsc
def get_summarizer():
    """Lazily build and cache the summarization pipeline (GPU/fp16-aware
    via make_pipe(); previously always CPU)."""
    global _summ
    if _summ is None:
        _summ = make_pipe("summarization", SUM_MODEL, fp16_ok=True)
    return _summ
def get_ocr():
    """Lazily build and cache the image-to-text (OCR) pipeline (GPU/fp16-aware
    via make_pipe(); previously always CPU)."""
    global _ocr
    if _ocr is None:
        _ocr = make_pipe("image-to-text", OCR_MODEL, fp16_ok=True)
    return _ocr
def get_qa():
    """Lazily build and cache the extractive question-answering pipeline.

    Uses make_pipe() for GPU placement when available (fp16 left off,
    matching the module's example usage for this task).
    """
    global _qa
    if _qa is None:
        _qa = make_pipe("question-answering", QA_MODEL)
    return _qa
# Candidate labels offered to the zero-shot classifier when the user does not
# type their own comma-separated list in the UI.
DEFAULT_INTENTS = [
    "turn_on_lights","turn_off_lights","volume_up","volume_down",
    "start_music","pause_music","set_timer","cancel_timer",
    "open_calendar","create_note","start_recording","stop_recording"
]
# Demo "tool" implementations: each intent maps to a callable returning a
# human-readable confirmation string (no real side effects).  Only
# create_note accepts an argument — agent() special-cases it and passes the
# transcript, which is truncated to 60 characters here.
TOOLS = {
    "turn_on_lights": lambda: "Lights β†’ ON",
    "turn_off_lights": lambda: "Lights β†’ OFF",
    "volume_up": lambda: "Volume β†’ UP",
    "volume_down": lambda: "Volume β†’ DOWN",
    "start_music": lambda: "Music β†’ PLAY",
    "pause_music": lambda: "Music β†’ PAUSE",
    "set_timer": lambda: "Timer β†’ 5 min (demo)",
    "cancel_timer": lambda: "Timer β†’ CANCELLED",
    "open_calendar": lambda: "Calendar β†’ OPENED",
    "create_note": lambda text="": f"Note saved: '{text[:60]}'",
    "start_recording": lambda: "Recording β†’ STARTED",
    "stop_recording": lambda: "Recording β†’ STOPPED",
}
def parse_intents(custom):
    """Turn a comma-separated intent string into a clean list of labels.

    Blank/empty input falls back to DEFAULT_INTENTS; surrounding whitespace
    is stripped and empty fragments are dropped.
    """
    if custom and custom.strip():
        return [label.strip() for label in custom.split(",") if label.strip()]
    return DEFAULT_INTENTS
def agent(audio_path, custom_intents, history):
    """Voice-agent step: transcribe, classify the intent, run the bound tool.

    Returns a 4-tuple for the UI: top-3 intent scores (dict for gr.Label),
    the chosen intent, the tool's result string, and the updated chat log.
    """
    if not audio_path:
        return gr.update(), gr.update(), "No audio.", history

    # Load both pipelines up front (matches the original load order).
    transcriber = get_asr()
    classifier = get_zsc()

    transcript = transcriber(audio_path)["text"].strip()
    if not transcript:
        return gr.update(), gr.update(), "No speech detected.", history

    candidates = parse_intents(custom_intents)
    ranking = classifier(transcript, candidate_labels=candidates, multi_label=False)
    labels = ranking["labels"]
    scores = ranking["scores"]
    top3 = {lbl: float(val) for lbl, val in zip(labels[:3], scores[:3])}

    chosen = labels[0]
    if chosen == "create_note":
        # Only tool that consumes the transcript.
        result = TOOLS[chosen](transcript)
    else:
        result = TOOLS.get(chosen, lambda: f"No tool bound: {chosen}")()

    stamp = time.strftime("%H:%M:%S")
    log_entry = [f"User: {transcript}", f"{stamp} β€’ {chosen} β†’ {result}"]
    return top3, chosen, result, history + [log_entry]
def do_ocr(image):
    """OCR an image, then summarize the extracted text.

    Returns ``(ocr_text, summary)``; both are empty strings when there is
    no image or no recognizable text.
    """
    if image is None:
        return "", ""

    # Load both pipelines first (matches the original load order).
    ocr_pipe = get_ocr()
    sum_pipe = get_summarizer()

    extracted = ocr_pipe(image)[0]["generated_text"]
    if not extracted.strip():
        return "", ""

    # Cap the summarizer input to keep latency/VRAM bounded.
    summary = sum_pipe(
        extracted[:3000], max_length=120, min_length=30, do_sample=False
    )[0]["summary_text"]
    return extracted, summary
def ask_qa(context_text, question):
    """Answer *question* against *context_text* with the extractive QA model.

    Returns an empty string when either input is missing/empty.
    """
    if context_text and question:
        output = get_qa()({"context": context_text, "question": question})
        return output.get("answer", "")
    return ""
def _run_agent(audio_path, custom_intents, history):
    """Adapter around agent() that also writes the updated log into gr.State.

    BUG FIX: the original wiring passed `state` as a click input but never
    listed it as an output, so the accumulated history was never persisted
    and the execution log effectively reset on every click.  Echoing the new
    history into both the Chatbot and the State fixes that.
    """
    top3, chosen_intent, result_text, new_history = agent(
        audio_path, custom_intents, history
    )
    return top3, chosen_intent, result_text, new_history, new_history


# ---------------------------------------------------------------------------
# UI: two tabs sharing the lazily loaded pipelines above.  On HF Spaces the
# gradio SDK auto-launches `demo`; presumably no explicit launch() is needed
# here — TODO confirm for local runs.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Multimodal Voice & OCR Agent") as demo:
    gr.Markdown("## 🎀🧾 Multimodal Voice & OCR Agent\nUses **pre-trained models** only. Models are loaded lazily per tab to reduce RAM.")
    with gr.Tabs():
        with gr.Tab("Voice Agent"):
            with gr.Row():
                audio = gr.Audio(sources=["microphone","upload"], type="filepath", label="Audio")
                intents_box = gr.Textbox(label="Intents (comma-separated)", value=", ".join(DEFAULT_INTENTS))
            run = gr.Button("Run")
            # gr.Label renders the {label: probability} dict from agent().
            topk = gr.Label(num_top_classes=3, label="Top-k Intents")
            chosen = gr.Textbox(label="Chosen Intent")
            result = gr.Textbox(label="Action Result")
            # NOTE(review): agent() emits [user, bot] pairs; newer gradio
            # versions prefer Chatbot(type="messages") — confirm version.
            chat = gr.Chatbot(label="Execution Log")
            state = gr.State([])  # accumulated execution log across clicks
            run.click(
                _run_agent,
                inputs=[audio, intents_box, state],
                outputs=[topk, chosen, result, chat, state],
                queue=True,
            )
        with gr.Tab("OCR + Summarize + QA"):
            img = gr.Image(type="filepath", label="Upload an image / screenshot / page")
            ocr_btn = gr.Button("Extract text + Summarize")
            ocr_text = gr.Textbox(label="OCR Text", lines=10)
            ocr_sum = gr.Textbox(label="Summary", lines=6)
            with gr.Row():
                question = gr.Textbox(label="Ask a question about the OCR text")
                qa_btn = gr.Button("Answer")
            answer = gr.Textbox(label="Answer")
            ocr_btn.click(do_ocr, inputs=img, outputs=[ocr_text, ocr_sum])
            qa_btn.click(ask_qa, inputs=[ocr_text, question], outputs=answer)