File size: 4,064 Bytes
6e11525
c690171
0e55c4c
c690171
6f8ba8b
 
a9c8ff8
0e55c4c
9fb5431
336db70
152382c
a9c8ff8
 
 
 
6f8ba8b
 
 
 
 
9fb5431
6e11525
6f8ba8b
 
 
 
c690171
6f8ba8b
c690171
 
 
6f8ba8b
 
6e11525
abf7f16
8d6fec1
 
a9c8ff8
 
 
 
abf7f16
a9c8ff8
 
 
 
 
 
8d6fec1
07856ef
8d6fec1
ee1b855
6f8ba8b
 
 
 
 
 
b596a08
6f8ba8b
 
 
b596a08
 
 
6f8ba8b
 
 
8d6fec1
6f8ba8b
 
 
b596a08
 
6f8ba8b
 
 
 
 
 
 
8d6fec1
6f8ba8b
 
 
9fb5431
3cb8dc9
a9c8ff8
6f8ba8b
 
 
 
 
 
 
 
 
 
 
558e5db
3cb8dc9
6e11525
6f8ba8b
 
 
a1f4075
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import cv2
import numpy as np
import gradio as gr
from ultralytics import YOLO
from PIL import Image
import torch
from torchvision import models, transforms

# Load YOLOv8 model for object detection (nano weights: smallest/fastest variant;
# downloads 'yolov8n.pt' on first run if not present)
model = YOLO('yolov8n.pt')

# Load Faster R-CNN model (torchvision, COCO-pretrained) used to build the text "caption"
faster_rcnn = models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
faster_rcnn.eval()  # inference mode: freezes batch-norm stats and disables dropout

# Define confidence and IOU thresholds
confidence_threshold = 0.25  # minimum score for a detection to be kept (both models)
iou_threshold = 0.45  # NMS overlap threshold for YOLO
# (width, height) for cv2.resize — frames are downscaled to this before YOLO inference
LOW_RES = (640, 320)

# Function to detect objects and draw bounding boxes
def detect_and_draw(frame):
    """Run YOLOv8 on a downscaled copy of *frame* and draw scaled-back boxes.

    Args:
        frame: BGR image (numpy array) as produced by OpenCV. Modified in place.

    Returns:
        The same frame with green bounding boxes and "<class> <conf>" labels drawn.
    """
    # Downscale for faster inference; coordinates are rescaled back below.
    low_res_frame = cv2.resize(frame, LOW_RES)
    results = model.predict(source=low_res_frame, conf=confidence_threshold, iou=iou_threshold, verbose=False)
    # LOW_RES is (width, height) while frame.shape is (height, width, channels).
    scale_x = frame.shape[1] / LOW_RES[0]
    scale_y = frame.shape[0] / LOW_RES[1]
    scale = np.array([scale_x, scale_y, scale_x, scale_y])
    for detection in results[0].boxes:
        # .cpu().numpy() first: when the model has been moved to CUDA (see the
        # __main__ guard), the raw box tensor lives on the GPU and cannot be
        # multiplied by a numpy array directly.
        x1, y1, x2, y2 = detection.xyxy[0].cpu().numpy() * scale
        confidence = float(detection.conf[0])
        cls_id = int(detection.cls[0])
        label = f"{model.names[cls_id]} {confidence:.2f}"
        cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
        cv2.putText(frame, label, (int(x1), int(y1) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    return frame

# Function to generate captions using Faster R-CNN
def generate_caption(image_frame):
    """Build a sentence per confident Faster R-CNN detection in *image_frame*.

    Args:
        image_frame: HxWx3 uint8 image. NOTE(review): torchvision detection
            models expect RGB — callers passing OpenCV BGR frames should
            convert first.

    Returns:
        A single space-joined string of "Object <label-id> detected with
        confidence <score>" sentences; empty string when nothing passes
        ``confidence_threshold``.
    """
    transform = transforms.Compose([
        transforms.ToTensor()
    ])
    image_tensor = transform(image_frame).unsqueeze(0)

    # no_grad: pure inference, skip autograd bookkeeping.
    with torch.no_grad():
        outputs = faster_rcnn(image_tensor)

    captions = []
    # Cast tensors to plain Python scalars so the caption reads
    # "Object 1 ..." rather than "Object tensor(1) ...".
    for label, score in zip(outputs[0]['labels'], outputs[0]['scores']):
        score = float(score)
        if score > confidence_threshold:
            captions.append(f"Object {int(label)} detected with confidence {score:.2f}")

    return " ".join(captions)

# Define the stream URL for live video (public HLS/m3u8 test stream; availability
# is outside this program's control — process_stream exits cleanly if it is down)
stream_url = "https://edge01.london.nginx.hdontap.com/hosb5/ng_showcase-coke_bottle-street_fixed.stream/chunklist_w464099566.m3u8"

# Process video stream and generate captions
def process_stream():
    """Yield annotated RGB frames from the live HLS stream.

    Opens ``stream_url``, processes every 10th frame (caption + YOLO overlay),
    prints the caption, and yields RGB images suitable for a Gradio Image
    component. Returns immediately (yielding nothing) if the stream cannot
    be opened.
    """
    cap = cv2.VideoCapture(stream_url)
    if not cap.isOpened():
        return
    frame_count = 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            if frame_count % 10 == 0:  # Process every 10th frame for efficiency
                # Caption BEFORE drawing so Faster R-CNN does not see YOLO's
                # boxes, and convert to RGB first (torchvision expects RGB,
                # OpenCV delivers BGR).
                caption = generate_caption(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                result = detect_and_draw(frame)
                result_rgb = cv2.cvtColor(result, cv2.COLOR_BGR2RGB)
                print(f"Caption: {caption}")
                yield result_rgb
    finally:
        # Release the capture even if the consumer abandons the generator early.
        cap.release()

# Function to predict and annotate an uploaded image
def predict_image(image):
    """Detect objects in an uploaded image and build a summary string.

    Args:
        image: RGB numpy array from the Gradio Image component.

    Returns:
        Tuple of (annotated RGB image, "Objects detected: N, Caption: ..." text).
    """
    results = model.predict(source=image, conf=confidence_threshold)
    # Ultralytics' Results.plot() returns a BGR array; flip the channel order
    # so the annotated image displays with correct colors in Gradio (RGB).
    annotated_image = results[0].plot()[:, :, ::-1]
    object_count = len(results[0].boxes)

    # Generate caption for the uploaded image (already RGB from Gradio).
    caption = generate_caption(image)

    return annotated_image, f"Objects detected: {object_count}, Caption: {caption}"

# Gradio interface: two tabs — live-stream annotation and single-image upload.
with gr.Blocks() as demo:
    gr.Markdown("## YOLOv8 Object Detection with Image Captioning (Faster R-CNN)")
    with gr.Tab("Live Video"):
        gr.Markdown("### Real-Time Object Detection and Captioning from Live Stream")
        live_output = gr.Image(label="Live Video with YOLOv8 Annotations", streaming=True)
        # NOTE(review): binding 'change' on the OUTPUT component means the
        # generator only starts once the image changes — which nothing else
        # triggers, so the stream may never start. A Button.click or demo.load
        # trigger is probably intended; confirm against the Gradio version used.
        live_output.change(fn=process_stream, inputs=None, outputs=live_output)
    with gr.Tab("Upload Image"):
        gr.Markdown("### Object Detection and Captioning from Uploaded Image")
        uploaded_image = gr.Image(type="numpy", label="Upload Image")
        image_output = gr.Image(label="Annotated Image with YOLOv8 Annotations")
        object_count_image = gr.Textbox(label="Object Count and Caption", interactive=False)

        # Re-run detection + captioning whenever a new image is uploaded.
        uploaded_image.change(fn=predict_image, inputs=uploaded_image, outputs=[image_output, object_count_image])

# Launch the Gradio interface
if __name__ == "__main__":
    # Move the YOLO weights to GPU when available. NOTE(review): faster_rcnn
    # is left on CPU and generate_caption builds CPU tensors, so only YOLO
    # benefits from CUDA here — confirm this is intentional.
    if torch.cuda.is_available():
        model.to('cuda')
    demo.queue()  # enable request queuing, required for generator/streaming outputs
    demo.launch()