import gradio as gr
import cv2
import torch
import os
import numpy as np
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# Load Models
def load_model(backbone_name, num_classes):
    """Build a Faster R-CNN detector and load its fine-tuned weights from disk.

    Args:
        backbone_name: ``"resnet50"`` or ``"mobilenet"``.
        num_classes: Number of classes for the replacement box-predictor head.

    Returns:
        The detector, moved to CUDA when available (CPU otherwise) and
        switched to evaluation mode.

    Raises:
        ValueError: If ``backbone_name`` is not a supported backbone.
    """
    # Resolve the backbone first so an unsupported name fails with a clear
    # message instead of an UnboundLocalError further down.
    if backbone_name == "resnet50":
        build_detector = torchvision.models.detection.fasterrcnn_resnet50_fpn
        checkpoint_path = "fasterrcnnResnet.pth"
    elif backbone_name == "mobilenet":
        build_detector = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn
        checkpoint_path = "fasterrcnnMobilenet.pth"
    else:
        raise ValueError(
            f"Unsupported backbone {backbone_name!r}; "
            "expected 'resnet50' or 'mobilenet'"
        )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = build_detector(pretrained=False)
    # Swap the stock classification head for one sized to our label set so
    # the checkpoint's predictor weights fit.
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # NOTE: torch.load unpickles the file; only load checkpoints from a
    # trusted source.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.to(device)
    model.eval()
    return model


# Human-readable label for each class id; the list is indexed directly by the
# label id returned by the detector, with 'background' occupying index 0.
class_names = [
    'background',
    'Ambulance',
    'Bus',
    'Car',
    'Motorcycle',
    'Truck',
]

# Inference Function for Images and Videos

def predict_image(image_path, model, score_threshold=0.5):
    """Run the detector on an image file and draw the detections on it.

    Args:
        image_path: Path of the image to read with OpenCV.
        model: Detection model; called with a ``(1, 3, H, W)`` float tensor
            scaled to ``[0, 1]`` and expected to return a list whose first
            item is a dict with ``'boxes'``, ``'labels'`` and ``'scores'``.
        score_threshold: Detections scoring at or below this are skipped.

    Returns:
        The image as an RGB ``numpy`` array with a green box and a
        ``"<class>: <score>"`` caption drawn for every kept detection.

    Raises:
        ValueError: If the image cannot be read from ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise ValueError(f"Could not read image: {image_path!r}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The input must live on the same device as the model's weights,
    # otherwise inference fails when the model was moved to CUDA.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device('cpu')

    image_tensor = (
        torch.tensor(image / 255.0).permute(2, 0, 1).float().unsqueeze(0).to(device)
    )
    with torch.no_grad():
        output = model(image_tensor)[0]

    # Convert to plain Python values once instead of per-element tensor access.
    detections = zip(
        output['boxes'].tolist(),
        output['labels'].tolist(),
        output['scores'].tolist(),
    )
    for box, label, score in detections:
        if score > score_threshold:
            x1, y1, x2, y2 = map(int, box)
            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
            # Keep the caption inside the image when the box touches the top edge.
            caption_y = max(y1 - 10, 15)
            cv2.putText(image, f"{class_names[label]}: {score:.2f}", (x1, caption_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
    return image


def predict_video(video_path, model, score_threshold=0.5):
    """Run the detector on every frame of a video and write an annotated copy.

    Args:
        video_path: Path of the video to read with OpenCV.
        model: Detection model; called with a ``(1, 3, H, W)`` float tensor
            scaled to ``[0, 1]`` and expected to return a list whose first
            item is a dict with ``'boxes'``, ``'labels'`` and ``'scores'``.
        score_threshold: Detections scoring at or below this are skipped.

    Returns:
        Path of the annotated video (``'output_video.mp4'``).

    Raises:
        ValueError: If no frame can be read from ``video_path``.
    """
    # NOTE: the output name is fixed, so concurrent calls overwrite each other.
    output_path = 'output_video.mp4'

    # The input must live on the same device as the model's weights.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device('cpu')

    cap = cv2.VideoCapture(video_path)
    out = None
    try:
        # Preserve the source frame rate; fall back to 20 when the container
        # does not report one (OpenCV then returns 0 or NaN).
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            fps = 20

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV decodes to BGR; feed the model RGB like predict_image does.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_tensor = (
                torch.tensor(rgb_frame / 255.0)
                .permute(2, 0, 1)
                .float()
                .unsqueeze(0)
                .to(device)
            )
            with torch.no_grad():
                output = model(frame_tensor)[0]

            detections = zip(
                output['boxes'].tolist(),
                output['labels'].tolist(),
                output['scores'].tolist(),
            )
            for box, label, score in detections:
                if score > score_threshold:
                    x1, y1, x2, y2 = map(int, box)
                    # Draw on the BGR frame, which is what VideoWriter expects.
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                    # Keep the caption inside the frame for boxes at the top edge.
                    caption_y = max(y1 - 10, 15)
                    cv2.putText(frame, f"{class_names[label]}: {score:.2f}", (x1, caption_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

            # Open the writer lazily (the frame size is only known after the
            # first read) and stream frames out instead of buffering the
            # whole video in memory.
            if out is None:
                height, width = frame.shape[:2]
                out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
            out.write(frame)
    finally:
        cap.release()
        if out is not None:
            out.release()

    if out is None:
        raise ValueError(f"Could not read any frames from video: {video_path!r}")
    return output_path

# Gradio Interface for Image and Video Inference

# Detectors are cached per backbone so each checkpoint is read from disk once
# instead of on every request.
_model_cache = {}


def _get_model(model_name):
    """Return the detector for a dropdown choice, loading it on first use."""
    if not model_name:
        raise gr.Error("Please select a model.")
    backbone = model_name.lower()
    if backbone not in _model_cache:
        _model_cache[backbone] = load_model(backbone, num_classes=len(class_names))
    return _model_cache[backbone]


def _run_image_inference(img, model_name):
    """Gradio handler: annotate the uploaded image with the chosen model."""
    if img is None:
        raise gr.Error("Please upload an image.")
    return predict_image(img, _get_model(model_name))


def _run_video_inference(vid, model_name):
    """Gradio handler: annotate the uploaded video with the chosen model."""
    if vid is None:
        raise gr.Error("Please upload a video.")
    return predict_video(vid, _get_model(model_name))


def _model_dropdown():
    """Create a model selector; each tab gets its own component instance."""
    return gr.Dropdown(
        choices=["ResNet50", "MobileNet"],
        value="ResNet50",  # a default avoids submitting with no model chosen
        label="Select Model",
    )


model_selection = _model_dropdown()

inputs_image = [gr.Image(type="filepath", label="Upload Image"), model_selection]
outputs_image = gr.Image(type="numpy", label="Detection Output")

# A separate dropdown instance: one component should not be placed in two
# Interfaces at once.
inputs_video = [gr.Video(label="Upload Video"), _model_dropdown()]
outputs_video = gr.Video(label="Detection Output")


with gr.Blocks() as demo:
    with gr.TabItem("Image"):
        gr.Interface(
            fn=_run_image_inference,
            inputs=inputs_image,
            outputs=outputs_image,
            title="Image Inference"
        )

    with gr.TabItem("Video"):
        gr.Interface(
            fn=_run_video_inference,
            inputs=inputs_video,
            outputs=outputs_video,
            title="Video Inference"
        )

# Launch after the Blocks context has closed so the layout is fully built.
demo.launch()