import cv2
import numpy as np
import gradio as gr
from ultralytics import YOLO
from PIL import Image
import torch
from torchvision import models, transforms
# Load YOLOv8 model for object detection (nano checkpoint: smallest/fastest variant)
model = YOLO('yolov8n.pt')
# Load Faster R-CNN model — used only to turn detections into text captions
# NOTE(review): `pretrained=True` is deprecated in recent torchvision in favor
# of the `weights=` argument — confirm the installed torchvision version.
faster_rcnn = models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
faster_rcnn.eval()  # inference mode: freezes dropout / batch-norm statistics
# Define confidence and IOU thresholds (shared by both detection paths)
confidence_threshold = 0.25
iou_threshold = 0.45
# (width, height) that frames are downscaled to before YOLO inference
LOW_RES = (640, 320)
# Function to detect objects and draw bounding boxes
def detect_and_draw(frame):
    """Run YOLOv8 on a downscaled copy of *frame* and draw the detections.

    The frame is annotated in place with green boxes and "<class> <conf>"
    labels, then returned.
    """
    resized = cv2.resize(frame, LOW_RES)
    predictions = model.predict(
        source=resized,
        conf=confidence_threshold,
        iou=iou_threshold,
        verbose=False,
    )
    # Factors mapping box coordinates from the low-res image back to the original.
    sx = frame.shape[1] / LOW_RES[0]
    sy = frame.shape[0] / LOW_RES[1]
    scale = np.array([sx, sy, sx, sy])
    for box in predictions[0].boxes:
        x1, y1, x2, y2 = box.xyxy[0] * scale
        score = box.conf[0]
        class_name = model.names[int(box.cls[0])]
        label = f"{class_name} {score:.2f}"
        top_left = (int(x1), int(y1))
        bottom_right = (int(x2), int(y2))
        cv2.rectangle(frame, top_left, bottom_right, (0, 255, 0), 2)
        cv2.putText(frame, label, (top_left[0], top_left[1] - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    return frame
# Function to generate captions using Faster R-CNN
def generate_caption(image_frame):
    """Describe *image_frame* via Faster R-CNN detections.

    Returns a space-separated string with one sentence per detection whose
    score exceeds the global confidence threshold.  Labels are raw category
    indices as produced by torchvision (no name mapping is applied here).
    """
    to_tensor = transforms.ToTensor()
    batch = to_tensor(image_frame).unsqueeze(0)
    with torch.no_grad():
        detections = faster_rcnn(batch)[0]
    sentences = [
        f"Object {label} detected with confidence {score:.2f}"
        for label, score in zip(detections['labels'], detections['scores'])
        if score > confidence_threshold
    ]
    return " ".join(sentences)
# Define the stream URL for the live video feed (public HLS/m3u8 stream)
stream_url = "https://edge01.london.nginx.hdontap.com/hosb5/ng_showcase-coke_bottle-street_fixed.stream/chunklist_w464099566.m3u8"
# Process video stream and generate captions
def process_stream():
    """Yield annotated RGB frames from the live HLS stream.

    Opens the stream, processes every 10th frame (YOLOv8 annotation plus a
    Faster R-CNN caption printed to stdout), and yields each annotated frame
    converted BGR->RGB for display in Gradio.  Yields nothing if the stream
    cannot be opened.  The capture is always released, even if the consumer
    abandons the generator early.
    """
    cap = cv2.VideoCapture(stream_url)
    if not cap.isOpened():
        return
    frame_count = 0
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            if frame_count % 10 == 0:  # Process every 10th frame for efficiency
                # Caption from the clean frame *before* boxes are drawn:
                # detect_and_draw annotates the frame in place, so calling it
                # first would caption an image with boxes painted over it.
                caption = generate_caption(frame)
                result = detect_and_draw(frame)
                result_rgb = cv2.cvtColor(result, cv2.COLOR_BGR2RGB)
                print(f"Caption: {caption}")
                yield result_rgb
    finally:
        # Guarantee release on break, exception, or generator close.
        cap.release()
# Function to predict and annotate an uploaded image
def predict_image(image):
    """Annotate an uploaded image with YOLOv8 detections and a caption.

    Parameters: image -- numpy HxWxC array from the Gradio upload widget.
    Returns (annotated_image, summary) where summary reports the detection
    count and the Faster R-CNN caption.
    """
    # Pass iou_threshold so NMS behaves identically to the live-stream path
    # (detect_and_draw); previously this path used the library default IoU.
    results = model.predict(source=image, conf=confidence_threshold, iou=iou_threshold)
    annotated_image = results[0].plot()
    object_count = len(results[0].boxes)
    # Generate caption for the uploaded image
    caption = generate_caption(image)
    return annotated_image, f"Objects detected: {object_count}, Caption: {caption}"
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## YOLOv8 Object Detection with Image Captioning (Faster R-CNN)")
    with gr.Tab("Live Video"):
        gr.Markdown("### Real-Time Object Detection and Captioning from Live Stream")
        live_output = gr.Image(label="Live Video with YOLOv8 Annotations", streaming=True)
        # BUG FIX: the previous `live_output.change(...)` wiring could never
        # fire — `change` triggers when the image changes, but nothing ever
        # changed it, so the stream never started.  A button drives the
        # generator instead; Gradio streams each yielded frame to the image.
        start_button = gr.Button("Start Stream")
        start_button.click(fn=process_stream, inputs=None, outputs=live_output)
    with gr.Tab("Upload Image"):
        gr.Markdown("### Object Detection and Captioning from Uploaded Image")
        uploaded_image = gr.Image(type="numpy", label="Upload Image")
        image_output = gr.Image(label="Annotated Image with YOLOv8 Annotations")
        object_count_image = gr.Textbox(label="Object Count and Caption", interactive=False)
        # Re-run detection whenever a new image is supplied.
        uploaded_image.change(fn=predict_image, inputs=uploaded_image, outputs=[image_output, object_count_image])
# Launch the Gradio interface
if __name__ == "__main__":
    # Move the YOLO weights onto the GPU when one is available.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.to('cuda')
    demo.queue()
    demo.launch()