"""Real-time object detection on a webcam feed.

Frames are grabbed with OpenCV, run through a Hugging Face object-detection
model, and shown in a window with the detections drawn on top.
Press 'q' in the window to quit.
"""

import cv2
import torch
from transformers import AutoImageProcessor, AutoModelForObjectDetection

# Hub checkpoint to load. The original identifier
# ("facebook/detectron2_resnet50") does not appear to exist on the Hub;
# DETR with a ResNet-50 backbone is the closest published equivalent.
# NOTE(review): this checkpoint may need the optional `timm` package for its
# backbone -- confirm against the installed transformers version.
MODEL_NAME = "facebook/detr-resnet-50"
CAMERA_INDEX = 0
# Detections scoring below this confidence are discarded.
SCORE_THRESHOLD = 0.7
WINDOW_TITLE = "Object Detection"
BOX_COLOR = (0, 255, 0)  # green, in OpenCV's BGR channel order
QUIT_KEY = ord("q")


def detect_objects(frame, model, processor, threshold=SCORE_THRESHOLD):
    """Run the detector on one BGR frame.

    Args:
        frame: Image array as returned by ``cv2.VideoCapture.read`` (BGR).
        model: Object-detection model loaded via
            ``AutoModelForObjectDetection``.
        processor: The matching image processor.
        threshold: Minimum confidence score for a detection to be kept.

    Returns:
        A list of ``((x1, y1, x2, y2), label, score)`` tuples, with the box
        given in integer pixel coordinates of ``frame``.
    """
    # OpenCV delivers BGR; the image processor expects RGB.
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    inputs = processor(images=rgb, return_tensors="pt")

    # Inference only: skip autograd bookkeeping on every frame.
    with torch.no_grad():
        outputs = model(**inputs)

    # The raw model boxes are normalized; post-processing rescales them to
    # the pixel size of the original frame.
    height, width = frame.shape[:2]
    result = processor.post_process_object_detection(
        outputs, threshold=threshold, target_sizes=[(height, width)]
    )[0]

    id2label = model.config.id2label
    detections = []
    for box, label_id, score in zip(
        result["boxes"], result["labels"], result["scores"]
    ):
        class_id = label_id.item()
        corners = tuple(int(v) for v in box.tolist())
        # Fall back to the numeric id when the config has no name for it.
        detections.append(
            (corners, id2label.get(class_id, str(class_id)), score.item())
        )
    return detections


def draw_detections(frame, detections):
    """Draw a rectangle and a caption on ``frame`` (in place) per detection."""
    for (x1, y1, x2, y2), label, score in detections:
        cv2.rectangle(frame, (x1, y1), (x2, y2), BOX_COLOR, 2)
        # Keep the caption inside the image when the box touches the top edge.
        text_origin = (x1, max(y1 - 10, 20))
        cv2.putText(
            frame,
            f"{label}: {score:.2f}",
            text_origin,
            cv2.FONT_HERSHEY_SIMPLEX,
            0.9,
            BOX_COLOR,
            2,
        )


def main():
    """Run the capture/detect/display loop.

    Returns:
        Process exit status: 0 on a normal quit, 1 if the camera could not
        be opened.
    """
    # An image processor (not a text tokenizer) prepares frames for the model.
    processor = AutoImageProcessor.from_pretrained(MODEL_NAME)
    model = AutoModelForObjectDetection.from_pretrained(MODEL_NAME)
    model.eval()

    cap = cv2.VideoCapture(CAMERA_INDEX)
    try:
        if not cap.isOpened():
            print("Failed to open camera.")
            return 1

        while True:
            ret, frame = cap.read()
            if not ret:
                print("Failed to capture frame.")
                break

            draw_detections(frame, detect_objects(frame, model, processor))
            cv2.imshow(WINDOW_TITLE, frame)

            # Exit on 'q' key press.
            if cv2.waitKey(1) & 0xFF == QUIT_KEY:
                break
    finally:
        # Always free the camera and close the window, even after an error.
        cap.release()
        cv2.destroyAllWindows()
    return 0


if __name__ == "__main__":
    raise SystemExit(main())