# app.py — YOLOv8 object detection + Faster R-CNN captioning demo (Gradio app)
import cv2
import numpy as np
import gradio as gr
from ultralytics import YOLO
from PIL import Image
import torch
from torchvision import models, transforms
# Load YOLOv8 model for object detection (downloads yolov8n.pt on first run).
model = YOLO('yolov8n.pt')
# Load Faster R-CNN model, used only to generate text "captions" of detections.
# NOTE(review): `pretrained=` is deprecated in newer torchvision in favour of
# `weights=`; confirm the pinned torchvision version still accepts it.
faster_rcnn = models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
faster_rcnn.eval()
# Define confidence and IOU thresholds.
# Detections below this confidence are discarded (shared by both models).
confidence_threshold = 0.25
# Non-max-suppression IoU threshold passed to YOLO.
iou_threshold = 0.45
# (width, height) each video frame is downscaled to before YOLO inference.
LOW_RES = (640, 320)
# Function to detect objects and draw bounding boxes
def detect_and_draw(frame):
    """Detect objects with YOLOv8 and draw boxes/labels on ``frame`` in place.

    The frame is downscaled to ``LOW_RES`` for faster inference; detected
    boxes are scaled back up to the original resolution before drawing.

    Parameters
    ----------
    frame : np.ndarray
        HxWx3 image (OpenCV BGR convention). Modified in place.

    Returns
    -------
    np.ndarray
        The same ``frame`` object, annotated with green boxes and labels.
    """
    low_res_frame = cv2.resize(frame, LOW_RES)
    results = model.predict(source=low_res_frame, conf=confidence_threshold,
                            iou=iou_threshold, verbose=False)

    # Hoist the low-res -> full-res scaling factors out of the loop.
    scale_x = frame.shape[1] / LOW_RES[0]
    scale_y = frame.shape[0] / LOW_RES[1]
    scale = np.array([scale_x, scale_y, scale_x, scale_y])

    for detection in results[0].boxes:
        # Fix: convert the box tensor to numpy explicitly instead of relying
        # on torch.Tensor * np.ndarray broadcasting, which is fragile across
        # torch versions; .tolist() yields plain Python ints for OpenCV.
        x1, y1, x2, y2 = (detection.xyxy[0].cpu().numpy() * scale).astype(int).tolist()
        confidence = float(detection.conf[0])
        cls_id = int(detection.cls[0])
        label = f"{model.names[cls_id]} {confidence:.2f}"
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(frame, label, (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    return frame
# Function to generate captions using Faster R-CNN
def generate_caption(image_frame):
    """Run Faster R-CNN on a frame and summarize detections as a text caption.

    Parameters
    ----------
    image_frame : np.ndarray
        HxWx3 uint8 image. NOTE(review): torchvision detection models expect
        RGB channel order, while callers pass OpenCV BGR frames — confirm.

    Returns
    -------
    str
        Space-separated sentences, one per detection whose score exceeds
        ``confidence_threshold``. Empty string when nothing is detected.
    """
    # ToTensor converts HWC uint8 [0, 255] -> CHW float [0, 1].
    image_tensor = transforms.ToTensor()(image_frame).unsqueeze(0)
    with torch.no_grad():
        outputs = faster_rcnn(image_tensor)

    captions = []
    # The boxes themselves are not used for the caption, only labels/scores.
    for label, score in zip(outputs[0]['labels'], outputs[0]['scores']):
        if score > confidence_threshold:
            # Fix: convert 0-dim tensors to Python scalars so the caption
            # reads "Object 1 ..." instead of "Object tensor(1) ...".
            captions.append(
                f"Object {int(label)} detected with confidence {float(score):.2f}"
            )
    return " ".join(captions)
# Define the stream URL for live video (public HLS test stream).
stream_url = "https://edge01.london.nginx.hdontap.com/hosb5/ng_showcase-coke_bottle-street_fixed.stream/chunklist_w464099566.m3u8"


def process_stream():
    """Yield annotated RGB frames from the live video stream.

    Opens the HLS stream, processes every 10th frame (captioning followed
    by detection/annotation) and yields each annotated frame converted to
    RGB for Gradio display. Returns None immediately if the stream cannot
    be opened.

    Yields
    ------
    np.ndarray
        Annotated frame in RGB order.
    """
    cap = cv2.VideoCapture(stream_url)
    if not cap.isOpened():
        return None
    # try/finally guarantees the capture is released even if the consumer
    # closes this generator early (the original leaked it in that case).
    try:
        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            if frame_count % 10 == 0:  # Process every 10th frame for efficiency
                # Fix: caption the pristine frame BEFORE detect_and_draw
                # mutates it in place with boxes/labels — the original
                # captioned the already-annotated frame.
                caption = generate_caption(frame)
                result = detect_and_draw(frame)
                result_rgb = cv2.cvtColor(result, cv2.COLOR_BGR2RGB)
                print(f"Caption: {caption}")
                yield result_rgb
    finally:
        cap.release()
# Function to predict and annotate an uploaded image
def predict_image(image):
    """Annotate an uploaded image with YOLOv8 and caption it with Faster R-CNN.

    Parameters
    ----------
    image : np.ndarray
        Image as supplied by the Gradio upload component.

    Returns
    -------
    tuple
        (annotated image, summary string with object count and caption).
    """
    detections = model.predict(source=image, conf=confidence_threshold)
    first_result = detections[0]

    # Render the detections onto a copy of the image.
    annotated_image = first_result.plot()
    object_count = len(first_result.boxes)

    # Generate caption for the uploaded image via the Faster R-CNN model.
    caption = generate_caption(image)

    summary = f"Objects detected: {object_count}, Caption: {caption}"
    return annotated_image, summary
# Gradio interface: one tab for the live stream, one for single-image upload.
with gr.Blocks() as demo:
    gr.Markdown("## YOLOv8 Object Detection with Image Captioning (Faster R-CNN)")
    with gr.Tab("Live Video"):
        gr.Markdown("### Real-Time Object Detection and Captioning from Live Stream")
        live_output = gr.Image(label="Live Video with YOLOv8 Annotations", streaming=True)
        # NOTE(review): wiring the generator to the Image's own `change`
        # event means streaming only starts once the image changes — confirm
        # this actually fires on app load in the deployed Gradio version.
        live_output.change(fn=process_stream, inputs=None, outputs=live_output)
    with gr.Tab("Upload Image"):
        gr.Markdown("### Object Detection and Captioning from Uploaded Image")
        uploaded_image = gr.Image(type="numpy", label="Upload Image")
        image_output = gr.Image(label="Annotated Image with YOLOv8 Annotations")
        object_count_image = gr.Textbox(label="Object Count and Caption", interactive=False)
        # Re-run detection + captioning whenever a new image is uploaded.
        uploaded_image.change(fn=predict_image, inputs=uploaded_image, outputs=[image_output, object_count_image])
# Launch the Gradio interface.
if __name__ == "__main__":
    # Move YOLO to the GPU when available; faster_rcnn is left on CPU
    # (generate_caption feeds it CPU tensors).
    if torch.cuda.is_available():
        model.to('cuda')
    # Enable queuing so the streaming generator and slow predictions
    # don't block other requests.
    demo.queue()
    demo.launch()