"""Live webcam object detection with an ONNX model, served through Gradio.

Pipeline per frame: preprocess -> ONNX inference -> YOLO-style decode ->
draw boxes -> optional voice alert. Runs on CPU only.
"""

import gradio as gr
import onnxruntime as ort
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import time, os
import pyttsx3  # for optional voice alerts

# ---------------------------
# CONFIG
# ---------------------------
MODEL_PATH = "model.onnx"
INPUT_SIZE = (640, 640)          # (width, height) expected by the model
CONF_THRESHOLD_DEFAULT = 0.35

# Initialize voice engine.
# pyttsx3.init() raises on systems without a TTS driver (e.g. headless
# servers); degrade to silent mode instead of crashing the whole app.
try:
    engine = pyttsx3.init()
    engine.setProperty("rate", 180)
except Exception as exc:
    print(f"Voice alerts disabled (pyttsx3 init failed: {exc})")
    engine = None

# Load model — fail early with a clear message rather than an opaque
# onnxruntime error if the file is missing.
if not os.path.isfile(MODEL_PATH):
    raise FileNotFoundError(f"ONNX model not found: {MODEL_PATH}")
print(f"Loading ONNX model from: {MODEL_PATH}")
sess = ort.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])

print("\nONNX Model Inputs:")
for i, inp in enumerate(sess.get_inputs()):
    print(f"  Input[{i}] name={inp.name}, shape={inp.shape}, dtype={inp.type}")

print("\nONNX Model Outputs:")
for i, out in enumerate(sess.get_outputs()):
    print(f"  Output[{i}] name={out.name}, shape={out.shape}, dtype={out.type}")


def preprocess_frame(frame_np):
    """Convert an HxWx3 uint8 RGB frame into the model's input tensor.

    Resizes to INPUT_SIZE, scales pixels to [0, 1], and returns a
    float32 array of shape (1, 3, H, W) (NCHW).
    """
    img = Image.fromarray(frame_np.astype("uint8"), "RGB")
    img_resized = img.resize(INPUT_SIZE)
    arr = np.array(img_resized).astype(np.float32) / 255.0
    arr = np.transpose(arr, (2, 0, 1))[np.newaxis, ...]  # HWC -> NCHW
    return arr


def postprocess_outputs(outputs, orig_w, orig_h, conf_thresh=0.35):
    """Decode raw model outputs into a list of detection dicts.

    Assumes a YOLO-style layout: each row is
    [cx, cy, w, h, objectness, class_prob_0, ...].
    NOTE(review): this layout is assumed, not verified against the
    exported model — confirm for your specific export.

    Boxes whose coordinates are all <= 1.0 are treated as normalized and
    scaled to (orig_w, orig_h); otherwise they are assumed to already be
    in pixel units. Final boxes are clamped to the frame so downstream
    drawing never goes out of bounds.

    NOTE: no NMS is applied, so overlapping duplicate boxes may appear.

    Returns:
        list of {"box": [x1, y1, x2, y2], "score": float, "class": int}
    """
    outs = [np.array(o) for o in outputs]
    cand = outs[0]
    if cand.ndim == 3 and cand.shape[0] == 1:
        cand = cand[0]  # drop the batch dimension

    detections = []
    if cand.ndim == 2 and cand.shape[1] >= 6:
        for row in cand:
            cx, cy, w, h = row[0], row[1], row[2], row[3]
            obj_conf = float(row[4])
            class_probs = row[5:]
            best_idx = int(np.argmax(class_probs)) if class_probs.size > 0 else 0
            cls_conf = float(class_probs[best_idx]) if class_probs.size > 0 else 1.0
            score = obj_conf * cls_conf
            if score < conf_thresh:
                continue
            if max(cx, cy, w, h) <= 1.0:
                # Normalized center/size -> pixel corner coordinates.
                x1 = (cx - w / 2) * orig_w
                y1 = (cy - h / 2) * orig_h
                x2 = (cx + w / 2) * orig_w
                y2 = (cy + h / 2) * orig_h
            else:
                # Already pixel-space center/size.
                x1, y1, x2, y2 = cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2
            # Clamp to the frame bounds (edge detections can spill outside).
            x1 = max(0.0, min(float(x1), orig_w))
            y1 = max(0.0, min(float(y1), orig_h))
            x2 = max(0.0, min(float(x2), orig_w))
            y2 = max(0.0, min(float(y2), orig_h))
            detections.append({"box": [x1, y1, x2, y2], "score": score, "class": best_idx})
    return detections


def draw_boxes_on_image(pil_img, detections):
    """Return a copy of *pil_img* with red boxes and score labels drawn."""
    img = pil_img.convert("RGB")
    draw = ImageDraw.Draw(img)
    font = ImageFont.load_default()
    for d in detections:
        x1, y1, x2, y2 = d["box"]
        label = f"Class {d['class']}"
        txt = f"{label} {d['score']:.2f}"
        draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
        # Put the label just above the box, clamped to the top edge.
        draw.text((x1, max(0, y1 - 12)), txt, fill="red", font=font)
    return img


# Voice alert state: last message spoken, so we only announce changes.
last_spoken = ""


def speak_alert(detections):
    """Speak the detected class list if it changed since the last call.

    Best-effort: if TTS is unavailable or fails mid-stream, the live loop
    must keep running, so errors are reported but never raised.
    """
    global last_spoken
    if not detections or engine is None:
        return
    labels_detected = [f"class {d['class']}" for d in detections]
    # sorted() makes the dedup message deterministic; a bare set() has
    # arbitrary iteration order, which made the last_spoken comparison
    # unreliable and could re-announce identical detections.
    msg = ", ".join(sorted(set(labels_detected)))
    if msg != last_spoken:
        try:
            # NOTE: runAndWait() blocks until speech finishes, which
            # stalls this frame's callback.
            engine.say(f"Detected: {msg}")
            engine.runAndWait()
        except Exception as exc:
            print(f"Voice alert failed: {exc}")
        last_spoken = msg


def predict_live(frame, conf_threshold):
    """Gradio callback: run detection on one webcam frame.

    Args:
        frame: HxWx3 uint8 RGB numpy array from the webcam (or None).
        conf_threshold: minimum score from the UI slider.

    Returns:
        (annotated PIL image or None, debug text string)
    """
    if frame is None:
        return None, "No frame"
    orig_h, orig_w = frame.shape[:2]
    input_tensor = preprocess_frame(frame)
    input_name = sess.get_inputs()[0].name
    outputs = sess.run(None, {input_name: input_tensor})
    detections = postprocess_outputs(outputs, orig_w, orig_h, conf_thresh=conf_threshold)
    pil_img = Image.fromarray(frame.astype("uint8"), "RGB")
    out_img = draw_boxes_on_image(pil_img, detections)
    speak_alert(detections)
    debug_txt = (
        f"Model: {os.path.basename(MODEL_PATH)}\n"
        f"Detections: {len(detections)}"
    )
    return out_img, debug_txt


# Gradio interface with webcam + slider
iface = gr.Interface(
    fn=predict_live,
    inputs=[
        gr.Image(sources=["webcam"], type="numpy", label="Live Camera"),
        gr.Slider(0.05, 0.9, value=CONF_THRESHOLD_DEFAULT, step=0.05,
                  label="Confidence Threshold"),
    ],
    outputs=[gr.Image(type="pil"), gr.Textbox(lines=4)],
    live=True,
    title="ONNX Live Camera Detection",
    description="Continuous live detection with bounding boxes + voice alerts",
)

if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)