import cv2
import numpy as np
import gradio as gr
from ultralytics import YOLO
from PIL import Image
import torch
from torchvision import models, transforms

# Load YOLOv8 model for object detection (nano variant, for speed).
model = YOLO('yolov8n.pt')

# Load Faster R-CNN model used for the pseudo-captioning step.
# NOTE(review): `pretrained=True` is deprecated in torchvision >= 0.13;
# switch to `weights="DEFAULT"` once the minimum torchvision version allows.
faster_rcnn = models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
faster_rcnn.eval()

# Detection thresholds shared by both code paths.
confidence_threshold = 0.25
iou_threshold = 0.45
# (width, height) passed to cv2.resize for the low-resolution YOLO pass.
LOW_RES = (640, 320)

# Built once at import time instead of on every generate_caption call.
_TO_TENSOR = transforms.Compose([transforms.ToTensor()])


def detect_and_draw(frame):
    """Run YOLO on a low-res copy of `frame` and draw boxes on it in place.

    Args:
        frame: BGR image (numpy array) as produced by OpenCV. Mutated in place.

    Returns:
        The same `frame` array with bounding boxes and labels drawn on it.
    """
    low_res_frame = cv2.resize(frame, LOW_RES)
    results = model.predict(source=low_res_frame,
                            conf=confidence_threshold,
                            iou=iou_threshold,
                            verbose=False)
    # Boxes were predicted on the low-res image; scale them back to full size.
    scale_x = frame.shape[1] / LOW_RES[0]
    scale_y = frame.shape[0] / LOW_RES[1]
    for detection in results[0].boxes:
        # Move coordinates to CPU/numpy before mixing with ndarray math:
        # a CUDA tensor multiplied by a numpy array raises at runtime.
        x1, y1, x2, y2 = detection.xyxy[0].cpu().numpy() * np.array(
            [scale_x, scale_y, scale_x, scale_y])
        confidence = float(detection.conf[0])
        cls_id = int(detection.cls[0])
        label = f"{model.names[cls_id]} {confidence:.2f}"
        cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)),
                      (0, 255, 0), 2)
        cv2.putText(frame, label, (int(x1), int(y1) - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    return frame


def generate_caption(image_frame):
    """Describe objects detected in `image_frame` with Faster R-CNN.

    Args:
        image_frame: RGB image (numpy array or PIL image). Torchvision's
            pretrained detection models expect RGB channel order — callers
            holding an OpenCV BGR frame must convert first.

    Returns:
        Space-joined string of per-object pseudo-captions ("" if none pass
        the confidence threshold).
    """
    image_tensor = _TO_TENSOR(image_frame).unsqueeze(0)
    with torch.no_grad():
        outputs = faster_rcnn(image_tensor)
    captions = []
    for box, label, score in zip(outputs[0]['boxes'],
                                 outputs[0]['labels'],
                                 outputs[0]['scores']):
        if score > confidence_threshold:
            # Cast tensors to plain Python numbers for clean interpolation.
            captions.append(
                f"Object {int(label)} detected with confidence {float(score):.2f}")
    return " ".join(captions)


# Public HLS test stream used by the live-video tab.
stream_url = "https://edge01.london.nginx.hdontap.com/hosb5/ng_showcase-coke_bottle-street_fixed.stream/chunklist_w464099566.m3u8"


def process_stream():
    """Yield annotated RGB frames from the live stream (every 10th frame).

    Generator suitable as a streaming Gradio event handler. Prints the
    caption generated for each processed frame.
    """
    cap = cv2.VideoCapture(stream_url)
    if not cap.isOpened():
        return
    frame_count = 0
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            # Process every 10th frame for efficiency.
            if frame_count % 10 != 0:
                continue
            # BUGFIX: the caption must be computed BEFORE detect_and_draw
            # mutates the frame (otherwise Faster R-CNN sees the drawn boxes),
            # and Faster R-CNN expects RGB rather than OpenCV's BGR order.
            caption = generate_caption(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            result = detect_and_draw(frame)
            result_rgb = cv2.cvtColor(result, cv2.COLOR_BGR2RGB)
            print(f"Caption: {caption}")
            yield result_rgb
    finally:
        # Release the capture even if the consumer abandons the generator.
        cap.release()


def predict_image(image):
    """Annotate an uploaded image with YOLO detections and caption it.

    Args:
        image: RGB numpy array supplied by the Gradio Image component.

    Returns:
        Tuple of (annotated image, summary string with count and caption).
    """
    results = model.predict(source=image, conf=confidence_threshold)
    # NOTE(review): results[0].plot() returns BGR by default; if displayed
    # colors look swapped, convert with cv2.cvtColor before returning.
    annotated_image = results[0].plot()
    object_count = len(results[0].boxes)
    # Gradio delivers RGB, which matches what generate_caption expects.
    caption = generate_caption(image)
    return annotated_image, f"Objects detected: {object_count}, Caption: {caption}"


# Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("## YOLOv8 Object Detection with Image Captioning (Faster R-CNN)")
    with gr.Tab("Live Video"):
        gr.Markdown("### Real-Time Object Detection and Captioning from Live Stream")
        live_output = gr.Image(label="Live Video with YOLOv8 Annotations", streaming=True)
        # BUGFIX: wiring the generator to the output's own `change` event never
        # starts the stream — `change` only fires after the output has already
        # changed. An explicit button click is a working trigger.
        start_button = gr.Button("Start Stream")
        start_button.click(fn=process_stream, inputs=None, outputs=live_output)
    with gr.Tab("Upload Image"):
        gr.Markdown("### Object Detection and Captioning from Uploaded Image")
        uploaded_image = gr.Image(type="numpy", label="Upload Image")
        image_output = gr.Image(label="Annotated Image with YOLOv8 Annotations")
        object_count_image = gr.Textbox(label="Object Count and Caption", interactive=False)
        uploaded_image.change(fn=predict_image, inputs=uploaded_image,
                              outputs=[image_output, object_count_image])


# Launch the Gradio interface.
if __name__ == "__main__":
    if torch.cuda.is_available():
        model.to('cuda')
    demo.queue()
    demo.launch()