import gradio as gr
import onnxruntime as ort
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import time, os
import pyttsx3  # for optional voice alerts

# ---------------------------
# CONFIG
# ---------------------------
MODEL_PATH = "model.onnx"  # path to the ONNX detector weights, relative to CWD
INPUT_SIZE = (640, 640)  # (width, height) the model expects; see preprocess_frame
CONF_THRESHOLD_DEFAULT = 0.35  # initial value for the UI confidence slider

# Initialize voice engine
# NOTE(review): pyttsx3.init() runs at import time — the script fails fast
# here if no TTS backend is available on the host.
engine = pyttsx3.init()
engine.setProperty("rate", 180)  # speech rate in words per minute

# Load model
# Model is loaded once at import time on CPU; the input/output dump below
# is diagnostic only, useful for checking the expected tensor layout.
print(f"Loading ONNX model from: {MODEL_PATH}")
sess = ort.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])
print("\nONNX Model Inputs:")
for i, inp in enumerate(sess.get_inputs()):
    print(f"  Input[{i}] name={inp.name}, shape={inp.shape}, dtype={inp.type}")
print("\nONNX Model Outputs:")
for i, out in enumerate(sess.get_outputs()):
    print(f"  Output[{i}] name={out.name}, shape={out.shape}, dtype={out.type}")

# Preprocess
def preprocess_frame(frame_np):
    """Convert an HWC uint8 RGB frame into a normalized NCHW float32 tensor.

    The frame is resized to INPUT_SIZE (PIL's default resampling filter),
    scaled to [0, 1], and reordered to shape (1, C, H, W) for the ONNX
    session.
    """
    pil_frame = Image.fromarray(frame_np.astype("uint8"), "RGB")
    resized = pil_frame.resize(INPUT_SIZE)
    chw = np.transpose(np.asarray(resized, dtype=np.float32) / 255.0, (2, 0, 1))
    return chw[np.newaxis, ...]

# Postprocess
def postprocess_outputs(outputs, orig_w, orig_h, conf_thresh=0.35, input_size=(640, 640)):
    """Convert raw YOLO-style ONNX output into detections in original-image pixels.

    Expects ``outputs[0]`` shaped (1, N, 5 + num_classes) or (N, 5 + num_classes),
    each row being (cx, cy, w, h, obj_conf, *class_probs). Rows may be in
    normalized 0..1 coordinates or in model-input pixels; both are mapped to
    the original frame's coordinate space.

    Args:
        outputs: list of arrays returned by ``sess.run``.
        orig_w: original frame width in pixels.
        orig_h: original frame height in pixels.
        conf_thresh: minimum obj_conf * class_conf score to keep a row.
        input_size: (width, height) the model was fed; used to rescale
            pixel-space boxes back onto the original frame.

    Returns:
        List of dicts: {"box": [x1, y1, x2, y2] in original-image pixels,
        "score": float, "class": int class index}.
    """
    outs = [np.array(o) for o in outputs]
    cand = outs[0]
    if cand.ndim == 3 and cand.shape[0] == 1:
        cand = cand[0]  # drop the batch dimension

    detections = []
    if cand.ndim == 2 and cand.shape[1] >= 6:
        for row in cand:
            cx, cy, w, h = row[0], row[1], row[2], row[3]
            obj_conf = float(row[4])
            class_probs = row[5:]
            best_idx = int(np.argmax(class_probs)) if class_probs.size > 0 else 0
            cls_conf = float(class_probs[best_idx]) if class_probs.size > 0 else 1.0
            score = obj_conf * cls_conf
            if score < conf_thresh:
                continue
            if max(cx, cy, w, h) <= 1.0:
                # Normalized coordinates: scale directly to the original frame.
                x1 = (cx - w / 2) * orig_w
                y1 = (cy - h / 2) * orig_h
                x2 = (cx + w / 2) * orig_w
                y2 = (cy + h / 2) * orig_h
            else:
                # Pixel coordinates are in model-input space (e.g. 640x640).
                # BUG FIX: rescale to the original frame — previously the raw
                # input-space values were returned, misplacing every box when
                # the frame size differed from the model input size.
                sx = orig_w / input_size[0]
                sy = orig_h / input_size[1]
                x1 = (cx - w / 2) * sx
                y1 = (cy - h / 2) * sy
                x2 = (cx + w / 2) * sx
                y2 = (cy + h / 2) * sy
            detections.append({"box": [x1, y1, x2, y2], "score": score, "class": best_idx})
    return detections

# Draw boxes
def draw_boxes_on_image(pil_img, detections):
    """Render red bounding boxes plus class/score labels onto a copy of *pil_img*.

    Each detection dict must provide "box" ([x1, y1, x2, y2]), "score" and
    "class". Returns a new RGB image; the input image is not modified.
    """
    canvas = pil_img.convert("RGB")
    pen = ImageDraw.Draw(canvas)
    label_font = ImageFont.load_default()
    for det in detections:
        left, top, right, bottom = det["box"]
        caption = f"Class {det['class']} {det['score']:.2f}"
        pen.rectangle([left, top, right, bottom], outline="red", width=3)
        # Keep the label on-canvas when the box touches the top edge.
        pen.text((left, max(0, top - 12)), caption, fill="red", font=label_font)
    return canvas

# Voice alert
# Voice alert
last_spoken = ""  # last message spoken, to avoid repeating it every frame
def speak_alert(detections):
    """Speak the set of detected class labels, once per distinct set.

    Compares against the module-level ``last_spoken`` string so an unchanged
    detection set is not re-announced on every webcam frame. Blocks until
    speech completes (``runAndWait``).
    """
    global last_spoken
    if not detections:
        return
    labels_detected = {f"class {d['class']}" for d in detections}
    # BUG FIX: sort the labels so the same set of classes always produces
    # the same message text — bare set() iteration order is hash-dependent,
    # which could re-trigger speech for an unchanged detection set.
    msg = ", ".join(sorted(labels_detected))
    if msg != last_spoken:
        engine.say(f"Detected: {msg}")
        engine.runAndWait()
        last_spoken = msg

# Main function
def predict_live(frame, conf_threshold):
    """Run one detection pass on a webcam frame and annotate it.

    Args:
        frame: HWC uint8 RGB numpy array from Gradio, or None before the
            webcam delivers its first frame.
        conf_threshold: minimum detection score, from the UI slider.

    Returns:
        Tuple of (annotated PIL image, debug text), or (None, "No frame")
        when no frame is available.
    """
    if frame is None:
        return None, "No frame"

    height, width = frame.shape[:2]
    tensor = preprocess_frame(frame)
    feed_name = sess.get_inputs()[0].name
    raw_outputs = sess.run(None, {feed_name: tensor})
    detections = postprocess_outputs(raw_outputs, width, height, conf_thresh=conf_threshold)

    annotated = draw_boxes_on_image(Image.fromarray(frame.astype("uint8"), "RGB"), detections)
    speak_alert(detections)

    debug_txt = (
        f"Model: {os.path.basename(MODEL_PATH)}\n"
        f"Detections: {len(detections)}"
    )
    return annotated, debug_txt

# Gradio interface with webcam + slider.
# live=True makes Gradio re-run predict_live continuously as frames arrive,
# so every webcam frame goes through the full preprocess/infer/draw pipeline.
iface = gr.Interface(
    fn=predict_live,
    inputs=[
        gr.Image(sources=["webcam"], type="numpy", label="Live Camera"),
        gr.Slider(0.05, 0.9, value=CONF_THRESHOLD_DEFAULT, step=0.05, label="Confidence Threshold")
    ],
    outputs=[gr.Image(type="pil"), gr.Textbox(lines=4)],
    live=True,
    title="ONNX Live Camera Detection",
    description="Continuous live detection with bounding boxes + voice alerts"
)

# Bind on all interfaces so the app is reachable from other machines on the LAN.
if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)