import cv2
import numpy as np
import gradio as gr
from ultralytics import YOLO
from PIL import Image
import torch
from torchvision import models, transforms

# Load YOLOv8 model for object detection (nano variant, for speed).
model = YOLO('yolov8n.pt')

# Load Faster R-CNN model used for the pseudo-captioning step.
# NOTE(review): `pretrained=True` is deprecated in torchvision >= 0.13;
# switch to `weights="DEFAULT"` once the minimum torchvision version allows.
faster_rcnn = models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
faster_rcnn.eval()

# Detection thresholds shared by both code paths.
confidence_threshold = 0.25
iou_threshold = 0.45
# (width, height) passed to cv2.resize for the low-resolution YOLO pass.
LOW_RES = (640, 320)

# Built once at import time instead of on every generate_caption call.
_TO_TENSOR = transforms.Compose([transforms.ToTensor()])


def detect_and_draw(frame):
    """Run YOLO on a low-res copy of `frame` and draw boxes on it in place.

    Args:
        frame: BGR image (numpy array) as produced by OpenCV. Mutated in place.

    Returns:
        The same `frame` array with bounding boxes and labels drawn on it.
    """
    low_res_frame = cv2.resize(frame, LOW_RES)
    results = model.predict(source=low_res_frame,
                            conf=confidence_threshold,
                            iou=iou_threshold,
                            verbose=False)
    # Boxes were predicted on the low-res image; scale them back to full size.
    scale_x = frame.shape[1] / LOW_RES[0]
    scale_y = frame.shape[0] / LOW_RES[1]
    for detection in results[0].boxes:
        # Move coordinates to CPU/numpy before mixing with ndarray math:
        # a CUDA tensor multiplied by a numpy array raises at runtime.
        x1, y1, x2, y2 = detection.xyxy[0].cpu().numpy() * np.array(
            [scale_x, scale_y, scale_x, scale_y])
        confidence = float(detection.conf[0])
        cls_id = int(detection.cls[0])
        label = f"{model.names[cls_id]} {confidence:.2f}"
        cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)),
                      (0, 255, 0), 2)
        cv2.putText(frame, label, (int(x1), int(y1) - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    return frame


def generate_caption(image_frame):
    """Describe objects detected in `image_frame` with Faster R-CNN.

    Args:
        image_frame: RGB image (numpy array or PIL image). Torchvision's
            pretrained detection models expect RGB channel order — callers
            holding an OpenCV BGR frame must convert first.

    Returns:
        Space-joined string of per-object pseudo-captions ("" if none pass
        the confidence threshold).
    """
    image_tensor = _TO_TENSOR(image_frame).unsqueeze(0)
    with torch.no_grad():
        outputs = faster_rcnn(image_tensor)
    captions = []
    for box, label, score in zip(outputs[0]['boxes'],
                                 outputs[0]['labels'],
                                 outputs[0]['scores']):
        if score > confidence_threshold:
            # Cast tensors to plain Python numbers for clean interpolation.
            captions.append(
                f"Object {int(label)} detected with confidence {float(score):.2f}")
    return " ".join(captions)


# Public HLS test stream used by the live-video tab.
stream_url = "https://edge01.london.nginx.hdontap.com/hosb5/ng_showcase-coke_bottle-street_fixed.stream/chunklist_w464099566.m3u8"


def process_stream():
    """Yield annotated RGB frames from the live stream (every 10th frame).

    Generator suitable as a streaming Gradio event handler. Prints the
    caption generated for each processed frame.
    """
    cap = cv2.VideoCapture(stream_url)
    if not cap.isOpened():
        return
    frame_count = 0
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            # Process every 10th frame for efficiency.
            if frame_count % 10 != 0:
                continue
            # BUGFIX: the caption must be computed BEFORE detect_and_draw
            # mutates the frame (otherwise Faster R-CNN sees the drawn boxes),
            # and Faster R-CNN expects RGB rather than OpenCV's BGR order.
            caption = generate_caption(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            result = detect_and_draw(frame)
            result_rgb = cv2.cvtColor(result, cv2.COLOR_BGR2RGB)
            print(f"Caption: {caption}")
            yield result_rgb
    finally:
        # Release the capture even if the consumer abandons the generator.
        cap.release()


def predict_image(image):
    """Annotate an uploaded image with YOLO detections and caption it.

    Args:
        image: RGB numpy array supplied by the Gradio Image component.

    Returns:
        Tuple of (annotated image, summary string with count and caption).
    """
    results = model.predict(source=image, conf=confidence_threshold)
    # NOTE(review): results[0].plot() returns BGR by default; if displayed
    # colors look swapped, convert with cv2.cvtColor before returning.
    annotated_image = results[0].plot()
    object_count = len(results[0].boxes)
    # Gradio delivers RGB, which matches what generate_caption expects.
    caption = generate_caption(image)
    return annotated_image, f"Objects detected: {object_count}, Caption: {caption}"


# Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("## YOLOv8 Object Detection with Image Captioning (Faster R-CNN)")
    with gr.Tab("Live Video"):
        gr.Markdown("### Real-Time Object Detection and Captioning from Live Stream")
        live_output = gr.Image(label="Live Video with YOLOv8 Annotations", streaming=True)
        # BUGFIX: wiring the generator to the output's own `change` event never
        # starts the stream — `change` only fires after the output has already
        # changed. An explicit button click is a working trigger.
        start_button = gr.Button("Start Stream")
        start_button.click(fn=process_stream, inputs=None, outputs=live_output)
    with gr.Tab("Upload Image"):
        gr.Markdown("### Object Detection and Captioning from Uploaded Image")
        uploaded_image = gr.Image(type="numpy", label="Upload Image")
        image_output = gr.Image(label="Annotated Image with YOLOv8 Annotations")
        object_count_image = gr.Textbox(label="Object Count and Caption", interactive=False)
        uploaded_image.change(fn=predict_image, inputs=uploaded_image,
                              outputs=[image_output, object_count_image])


# Launch the Gradio interface.
if __name__ == "__main__":
    if torch.cuda.is_available():
        model.to('cuda')
    demo.queue()
    demo.launch()