import cv2
import torch
from transformers import AutoImageProcessor, AutoModelForObjectDetection, AutoTokenizer

# Load the model and tokenizer
model_name = "facebook/detectron2_resnet50"
model = AutoModelForObjectDetection.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set up the camera
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Failed to open camera.")
    exit()

# Main loop for object detection
while True:
    ret, frame = cap.read()
    if not ret:
        print("Failed to capture frame.")
        break

    # Preprocess the image
    image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    inputs = tokenizer(image, return_tensors="pt")

    # Perform object detection
    outputs = model(**inputs)
    predicted_boxes = outputs.pred_boxes[0].tensor.cpu().detach().numpy()
    predicted_labels = outputs.pred_classes[0].cpu().detach().numpy()

    # Visualize the predictions
    for box, label in zip(predicted_boxes, predicted_labels):
        x1, y1, x2, y2 = box.astype(int)
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(frame, str(label), (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

    # Display the frame
    cv2.imshow("Object Detection", frame)

    # Exit on 'q' key press
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the camera and close the window
cap.release()
cv2.destroyAllWindows()