| | import cv2 |
| | import numpy as np |
| | import gradio as gr |
| | from ultralytics import YOLO |
| | from PIL import Image |
| | import torch |
| | from torchvision import models, transforms |
| |
|
# YOLOv8 nano model for fast object detection (weights auto-download on first use).
model = YOLO('yolov8n.pt')

# Torchvision Faster R-CNN used as a secondary detector to build text "captions".
# `pretrained=True` is deprecated since torchvision 0.13; the `weights=` enum is
# the supported replacement and selects the same COCO-trained checkpoint.
faster_rcnn = models.detection.fasterrcnn_resnet50_fpn(
    weights=models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT
)
faster_rcnn.eval()  # inference mode: freezes batch-norm stats, disables dropout

# Shared detection hyper-parameters.
confidence_threshold = 0.25  # minimum score for a detection to be kept (both models)
iou_threshold = 0.45         # NMS IoU threshold for YOLO
LOW_RES = (640, 320)         # (width, height) for cv2.resize — speeds up YOLO inference
|
| | |
def detect_and_draw(frame):
    """Run YOLOv8 on a downscaled copy of *frame* and draw the detections.

    The frame is resized to LOW_RES for speed; box coordinates are scaled
    back to the original resolution before drawing. The input array is
    annotated in place and also returned.
    """
    small = cv2.resize(frame, LOW_RES)
    results = model.predict(
        source=small,
        conf=confidence_threshold,
        iou=iou_threshold,
        verbose=False,
    )
    # Factors mapping low-res box coordinates back onto the full-size frame.
    sx = frame.shape[1] / LOW_RES[0]
    sy = frame.shape[0] / LOW_RES[1]
    scale = np.array([sx, sy, sx, sy])
    for box in results[0].boxes:
        x1, y1, x2, y2 = box.xyxy[0] * scale
        score = box.conf[0]
        text = f"{model.names[int(box.cls[0])]} {score:.2f}"
        top_left = (int(x1), int(y1))
        bottom_right = (int(x2), int(y2))
        cv2.rectangle(frame, top_left, bottom_right, (0, 255, 0), 2)
        cv2.putText(frame, text, (top_left[0], top_left[1] - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    return frame
| |
|
| | |
def generate_caption(image_frame):
    """Describe *image_frame* by listing Faster R-CNN detections as text.

    Returns a space-joined string of "Object <label-id> detected with
    confidence <score>" entries, one per detection scoring above the shared
    confidence threshold; empty string when nothing clears it.
    """
    to_tensor = transforms.Compose([transforms.ToTensor()])
    batch = to_tensor(image_frame).unsqueeze(0)  # add batch dimension

    with torch.no_grad():
        predictions = faster_rcnn(batch)

    detection = predictions[0]
    parts = [
        f"Object {label} detected with confidence {score:.2f}"
        for label, score in zip(detection['labels'], detection['scores'])
        if score > confidence_threshold
    ]
    return " ".join(parts)
| |
|
| | |
# Public HLS (.m3u8) test stream consumed by process_stream for the "Live Video" tab.
stream_url = "https://edge01.london.nginx.hdontap.com/hosb5/ng_showcase-coke_bottle-street_fixed.stream/chunklist_w464099566.m3u8"
| |
|
| | |
def process_stream():
    """Generator: read the HLS stream and yield every 10th frame, annotated.

    Yields RGB numpy frames with YOLO boxes drawn; the Faster R-CNN caption
    for each yielded frame is printed to stdout. Stops silently if the
    stream cannot be opened or ends.
    """
    cap = cv2.VideoCapture(stream_url)
    if not cap.isOpened():
        return
    try:
        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            # Only process every 10th frame to keep up with the live stream.
            if frame_count % 10 == 0:
                # BUG FIX: caption the *clean* frame before detect_and_draw
                # mutates it in place (previously the caption model saw the
                # drawn-over boxes), and convert BGR->RGB since torchvision
                # models expect RGB while cv2 decodes frames as BGR.
                caption = generate_caption(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                result = detect_and_draw(frame)
                result_rgb = cv2.cvtColor(result, cv2.COLOR_BGR2RGB)
                print(f"Caption: {caption}")
                yield result_rgb
    finally:
        # Release the capture even if the consumer abandons the generator early.
        cap.release()
| |
|
| | |
def predict_image(image):
    """Annotate an uploaded image with YOLOv8 detections plus a caption.

    Parameters
    ----------
    image : numpy.ndarray
        RGB image as delivered by the gradio Image component (type="numpy").

    Returns
    -------
    tuple[numpy.ndarray, str]
        The annotated image and a summary string with the object count and
        the Faster R-CNN caption.
    """
    # Pass iou_threshold as well, for consistency with detect_and_draw
    # (previously NMS silently used the library default on this code path).
    results = model.predict(source=image, conf=confidence_threshold, iou=iou_threshold)
    annotated_image = results[0].plot()
    object_count = len(results[0].boxes)

    caption = generate_caption(image)

    return annotated_image, f"Objects detected: {object_count}, Caption: {caption}"
| |
|
| | |
# Gradio UI: one tab streaming annotated frames from the live feed, one tab
# for single-image upload.
with gr.Blocks() as demo:
    gr.Markdown("## YOLOv8 Object Detection with Image Captioning (Faster R-CNN)")
    with gr.Tab("Live Video"):
        gr.Markdown("### Real-Time Object Detection and Captioning from Live Stream")
        live_output = gr.Image(label="Live Video with YOLOv8 Annotations", streaming=True)
        # BUG FIX: the stream generator was wired to live_output.change, which
        # never fires because nothing updates that component first, so the
        # stream never started. An explicit button click starts it instead.
        start_button = gr.Button("Start Stream")
        start_button.click(fn=process_stream, inputs=None, outputs=live_output)
    with gr.Tab("Upload Image"):
        gr.Markdown("### Object Detection and Captioning from Uploaded Image")
        uploaded_image = gr.Image(type="numpy", label="Upload Image")
        image_output = gr.Image(label="Annotated Image with YOLOv8 Annotations")
        object_count_image = gr.Textbox(label="Object Count and Caption", interactive=False)

        # Re-run detection whenever a new image is uploaded.
        uploaded_image.change(fn=predict_image, inputs=uploaded_image,
                              outputs=[image_output, object_count_image])
| |
|
| | |
if __name__ == "__main__":
    # Move the YOLO model onto the GPU when one is available. faster_rcnn
    # stays on CPU, matching the CPU tensors generate_caption feeds it.
    device = "cuda" if torch.cuda.is_available() else None
    if device is not None:
        model.to(device)
    demo.queue()
    demo.launch()