import tempfile

import cv2
import gradio as gr
import torch
from PIL import Image
from transformers import YolosImageProcessor, YolosForObjectDetection
|
|
| |
# Hugging Face Hub checkpoint shared by the detector and its preprocessor.
MODEL_CHECKPOINT = "hustvl/yolos-tiny"

# Both objects are created once at import time and reused for every frame.
model = YolosForObjectDetection.from_pretrained(MODEL_CHECKPOINT)
image_processor = YolosImageProcessor.from_pretrained(MODEL_CHECKPOINT)
|
|
def process_frame(frame, threshold=0.9):
    """Run YOLOS object detection on one frame and draw the detections.

    The frame is resized to 640x360, passed through the module-level
    ``image_processor`` and ``model``, and every detection whose score
    exceeds ``threshold`` is drawn as a green rectangle with a
    ``"<label>: <score>"`` caption.

    Args:
        frame: Image array in OpenCV's BGR channel order (it is converted
            with ``cv2.COLOR_BGR2RGB`` before inference).
        threshold: Minimum confidence score for a detection to be drawn.
            Defaults to 0.9.

    Returns:
        The resized 640x360 frame with boxes and captions drawn on it.
    """
    frame = cv2.resize(frame, (640, 360))

    # The image processor is given an RGB PIL image, so swap channels first.
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    inputs = image_processor(images=image, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # PIL reports (width, height); post-processing takes (height, width).
    target_sizes = torch.tensor([image.size[::-1]])
    results = image_processor.post_process_object_detection(
        outputs, threshold=threshold, target_sizes=target_sizes
    )[0]

    color = (0, 255, 0)
    detections = zip(results["scores"], results["labels"], results["boxes"])
    for score, label, box in detections:
        x1, y1, x2, y2 = (int(round(v, 2)) for v in box.tolist())
        cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)

        caption = f"{model.config.id2label[label.item()]}: {round(score.item(), 2)}"
        # Place the caption above the box, but never so high that the text
        # is drawn outside the image when the box touches the top edge.
        caption_y = max(y1 - 10, 15)
        cv2.putText(
            frame, caption, (x1, caption_y),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2,
        )

    return frame
|
|
def video_object_detection(video):
    """Annotate every frame of a video with object detections.

    Frames are read one at a time, passed through ``process_frame`` and
    written straight to the output file, so memory use does not grow with
    the length of the video.

    Args:
        video: Path of the input video file, or ``None`` when no video has
            been supplied (e.g. the input component was cleared).

    Returns:
        Path of the annotated MP4 file, or ``None`` when ``video`` is
        ``None``. Each call writes to its own temporary file, which is not
        deleted by this function.

    Raises:
        ValueError: If the video cannot be opened or yields no frames.
    """
    if video is None:
        return None

    cap = cv2.VideoCapture(video)
    if not cap.isOpened():
        cap.release()
        raise ValueError(f"Could not open video: {video!r}")

    # Keep the source playback speed; fall back to 20 fps when the container
    # does not report a usable rate (0, negative or NaN).
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        fps = 20

    output_path = None
    writer = None
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            processed = process_frame(frame)

            if writer is None:
                # Open the writer lazily so its size matches the frames that
                # process_frame actually produces.
                height, width = processed.shape[:2]
                # A unique file per call keeps concurrent requests from
                # overwriting each other's output.
                with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
                    output_path = tmp.name
                writer = cv2.VideoWriter(
                    output_path,
                    cv2.VideoWriter_fourcc(*"mp4v"),
                    fps,
                    (width, height),
                )

            writer.write(processed)
    finally:
        # Release native handles even if processing a frame raised.
        cap.release()
        if writer is not None:
            writer.release()

    if output_path is None:
        raise ValueError(f"No frames could be read from video: {video!r}")

    return output_path
|
|
| |
# Gradio UI: one uploaded video in, one annotated video out.
# live=True asks Gradio to rerun the function when the input changes.
iface = gr.Interface(
    fn=video_object_detection,
    inputs="video",
    outputs="video",
    title="YOLOs-Tiny Video Detection",
    live=True,
)
iface.launch()
|
|
|
|