# irisbuddy / app.py — live ONNX object detection with a Gradio webcam UI.
import gradio as gr
import onnxruntime as ort
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import time, os
import pyttsx3 # for optional voice alerts
# ---------------------------
# CONFIG
# ---------------------------
MODEL_PATH = "model.onnx"
INPUT_SIZE = (640, 640)        # (width, height) the ONNX model expects
CONF_THRESHOLD_DEFAULT = 0.35  # default detection confidence threshold


class _SilentEngine:
    """No-op stand-in for pyttsx3 when no TTS backend is available.

    Exposes the same methods the app calls on the real engine, so
    `speak_alert` works unchanged (it just stays silent).
    """

    def say(self, text):
        pass

    def runAndWait(self):
        pass

    def setProperty(self, name, value):
        pass


# Initialize voice engine.
# pyttsx3.init() raises on headless hosts (no audio device / espeak missing),
# which previously killed the whole app at import time.  Fall back to a
# silent engine so detection keeps working without voice alerts.
try:
    engine = pyttsx3.init()
    engine.setProperty("rate", 180)  # speaking rate (words per minute)
except Exception as exc:
    print(f"Voice engine unavailable ({exc}); continuing without audio alerts.")
    engine = _SilentEngine()
# Load model
print(f"Loading ONNX model from: {MODEL_PATH}")
sess = ort.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])

# Dump the model's input/output signatures so shape mismatches are easy to spot.
print("\nONNX Model Inputs:")
for idx, spec in enumerate(sess.get_inputs()):
    print(f" Input[{idx}] name={spec.name}, shape={spec.shape}, dtype={spec.type}")

print("\nONNX Model Outputs:")
for idx, spec in enumerate(sess.get_outputs()):
    print(f" Output[{idx}] name={spec.name}, shape={spec.shape}, dtype={spec.type}")
# Preprocess
def preprocess_frame(frame_np):
    """Convert an RGB numpy frame into a normalized NCHW float32 tensor.

    The frame is resized to INPUT_SIZE, scaled into [0, 1], and reordered
    from HWC to CHW with a leading batch axis of size 1.
    """
    resized = Image.fromarray(frame_np.astype("uint8"), "RGB").resize(INPUT_SIZE)
    tensor = np.asarray(resized, dtype=np.float32) / 255.0
    # HWC -> CHW, then add the batch dimension.
    return np.transpose(tensor, (2, 0, 1))[None, ...]
# Postprocess
def postprocess_outputs(outputs, orig_w, orig_h, conf_thresh=0.35):
    """Decode raw ONNX outputs into a list of detection dicts.

    Expects the first output to be rows of
    [cx, cy, w, h, objectness, class_prob...] with at least 6 columns
    (an optional leading batch axis of size 1 is squeezed away).  Box
    coordinates that are all <= 1.0 are treated as normalized and scaled
    to (orig_w, orig_h); larger values are assumed to already be pixels.

    Returns a list of {"box": [x1, y1, x2, y2], "score": float, "class": int};
    rows scoring below `conf_thresh` are dropped.
    """
    preds = np.array(outputs[0])
    if preds.ndim == 3 and preds.shape[0] == 1:
        preds = preds[0]

    results = []
    # Anything that is not (N, >=6) is an unsupported layout -> no detections.
    if preds.ndim != 2 or preds.shape[1] < 6:
        return results

    for det in preds:
        cx, cy, w, h = det[0], det[1], det[2], det[3]
        objectness = float(det[4])
        probs = det[5:]
        if probs.size > 0:
            cls_id = int(np.argmax(probs))
            confidence = objectness * float(probs[cls_id])
        else:
            cls_id = 0
            confidence = objectness
        if confidence < conf_thresh:
            continue
        # Heuristic: all coordinates <= 1.0 means normalized to image size.
        if max(cx, cy, w, h) <= 1.0:
            scale_x, scale_y = orig_w, orig_h
        else:
            scale_x, scale_y = 1.0, 1.0
        half_w, half_h = w / 2, h / 2
        box = [
            (cx - half_w) * scale_x,
            (cy - half_h) * scale_y,
            (cx + half_w) * scale_x,
            (cy + half_h) * scale_y,
        ]
        results.append({"box": box, "score": confidence, "class": cls_id})
    return results
# Draw boxes
def draw_boxes_on_image(pil_img, detections):
    """Return an RGB copy of *pil_img* with red boxes and score labels.

    Each detection dict must provide "box" ([x1, y1, x2, y2]), "class",
    and "score"; the label is rendered just above the box's top-left
    corner (clamped so it never goes above the image).
    """
    canvas = pil_img.convert("RGB")
    painter = ImageDraw.Draw(canvas)
    default_font = ImageFont.load_default()
    for det in detections:
        left, top, right, bottom = det["box"]
        caption = f"Class {det['class']} {det['score']:.2f}"
        painter.rectangle([left, top, right, bottom], outline="red", width=3)
        painter.text((left, max(0, top - 12)), caption, fill="red", font=default_font)
    return canvas
# Voice alert
last_spoken = ""  # last announced message; used to avoid repeating alerts


def speak_alert(detections):
    """Announce the set of detected classes, once per change.

    Builds a deterministic, comma-separated message of the unique class
    labels and speaks it only when it differs from the previous
    announcement.  TTS errors are swallowed so audio problems never break
    the detection loop.
    """
    global last_spoken
    if not detections:
        return
    # sorted() gives a stable order: an identical set of classes always
    # produces the same message string, so the dedup below actually works
    # (bare set() iteration order varies and caused spurious re-announcements).
    unique_labels = sorted({f"class {d['class']}" for d in detections})
    msg = ", ".join(unique_labels)
    if msg != last_spoken:
        try:
            engine.say(f"Detected: {msg}")
            engine.runAndWait()
        except Exception as exc:
            # Best-effort voice alert: log and keep detecting.
            print(f"Voice alert failed: {exc}")
        # Mark as announced even on failure to avoid hammering a broken engine.
        last_spoken = msg
# Main function
def predict_live(frame, conf_threshold):
    """Run one detection pass on a webcam frame.

    Returns (annotated PIL image, debug text), or (None, "No frame") when
    the camera has not delivered a frame yet.
    """
    if frame is None:
        return None, "No frame"

    height, width = frame.shape[:2]

    # Inference: preprocess -> ONNX session -> decode detections.
    tensor = preprocess_frame(frame)
    feed = {sess.get_inputs()[0].name: tensor}
    raw_outputs = sess.run(None, feed)
    detections = postprocess_outputs(raw_outputs, width, height, conf_thresh=conf_threshold)

    annotated = draw_boxes_on_image(Image.fromarray(frame.astype("uint8"), "RGB"), detections)
    speak_alert(detections)

    debug_txt = f"Model: {os.path.basename(MODEL_PATH)}\nDetections: {len(detections)}"
    return annotated, debug_txt
# Gradio interface with webcam + slider
camera_input = gr.Image(sources=["webcam"], type="numpy", label="Live Camera")
threshold_slider = gr.Slider(
    0.05, 0.9, value=CONF_THRESHOLD_DEFAULT, step=0.05, label="Confidence Threshold"
)

iface = gr.Interface(
    fn=predict_live,
    inputs=[camera_input, threshold_slider],
    outputs=[gr.Image(type="pil"), gr.Textbox(lines=4)],
    live=True,
    title="ONNX Live Camera Detection",
    description="Continuous live detection with bounding boxes + voice alerts",
)

if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable from inside a container.
    iface.launch(server_name="0.0.0.0", server_port=7860)