"""Zero-shot object detection with OWLv2, with results drawn onto the image.

The script loads an image from disk, asks the OWLv2 model to find the regions
matching the free-text queries in ``text_labels``, prints every detection and
saves a copy of the image annotated with bounding boxes and labels.
"""
import os

import requests
import torch
from PIL import Image, ImageDraw, ImageFont
from transformers import Owlv2ForObjectDetection, Owlv2Processor

# Load the model and processor from the same checkpoint.
MODEL_ID = "google/owlv2-large-patch14-ensemble"
processor = Owlv2Processor.from_pretrained(MODEL_ID)
model = Owlv2ForObjectDetection.from_pretrained(MODEL_ID)

# Load the image from a local file.
image_path = "image.jpg"  # Replace with your image path
# Close the file handle as soon as the pixels are read, and normalise to RGB
# so that palette/alpha images can still be annotated and saved as JPEG.
with Image.open(image_path) as source_image:
    image = source_image.convert("RGB")

# Define what you want to detect (one list of queries per image).
text_labels = [["a person with a hat"]]

# Process the image and text, then run inference. Gradients are not needed,
# so disable autograd to avoid holding on to intermediate activations.
inputs = processor(text=text_labels, images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Target image sizes (height, width) to rescale box predictions [batch_size, 2]
target_sizes = torch.tensor([(image.height, image.width)])

# Convert outputs (bounding boxes and class logits) to Pascal VOC format
# (xmin, ymin, xmax, ymax)
results = processor.post_process_grounded_object_detection(
    outputs=outputs,
    target_sizes=target_sizes,
    threshold=0.1,
    text_labels=text_labels,
)

# Retrieve predictions for the first (and only) image
result = results[0]
boxes, scores, text_labels_detected = result["boxes"], result["scores"], result["text_labels"]

# Draw on a copy so the loaded image stays untouched
output_image = image.copy()
draw = ImageDraw.Draw(output_image)

# Prefer Arial when it is installed; otherwise fall back to Pillow's built-in font
try:
    font = ImageFont.truetype("arial.ttf", 16)
except OSError:
    font = ImageFont.load_default()

# Colors are cycled through when there are more detections than entries
colors = ["red", "blue", "green", "orange", "purple", "yellow", "pink", "cyan"]
LABEL_OFFSET = 25  # vertical gap (px) between the label and the top of its box
BOX_WIDTH = 3  # outline thickness (px) of the bounding box

print("Detection Results:")
print("-" * 50)

# `boxes` is a tensor, so test its length rather than its truthiness
if len(boxes) == 0:
    print("No detections above the score threshold.")

# Draw bounding boxes and labels
for i, (box, score, text_label) in enumerate(zip(boxes, scores, text_labels_detected)):
    box = [round(coord, 2) for coord in box.tolist()]
    confidence = round(score.item(), 3)
    print(f"Detected {text_label} with confidence {confidence} at location {box}")

    xmin, ymin, xmax, ymax = box
    color = colors[i % len(colors)]

    # Bounding box
    draw.rectangle([xmin, ymin, xmax, ymax], outline=color, width=BOX_WIDTH)

    # Label with confidence. It normally sits above the box; when the box
    # touches the top of the image there is no room there, so the label is
    # moved just inside the box instead of being clipped off-canvas.
    label_text = f"{text_label}: {confidence}"
    label_x = max(xmin, 0)
    label_y = ymin - LABEL_OFFSET
    if label_y < 0:
        label_y = max(ymin, 0) + BOX_WIDTH

    # Filled background behind the text, padded by 2 px on every side
    bbox = draw.textbbox((label_x, label_y), label_text, font=font)
    draw.rectangle([bbox[0] - 2, bbox[1] - 2, bbox[2] + 2, bbox[3] + 2], fill=color)
    draw.text((label_x, label_y), label_text, fill="white", font=font)

# Save the output image
output_path = "output_img.jpg"
output_image.save(output_path)
print(f"\nOutput image saved as: {output_path}")