"""Describe the objects found in an image using a DETR object-detection model."""

from transformers import pipeline
from PIL import Image

# Load object detection model (done once, at import time, and shared by
# every call to caption_image).
MODEL_NAME = "facebook/detr-resnet-50"
detector = pipeline("object-detection", model=MODEL_NAME)


def caption_image(image: Image.Image) -> dict:
    """Detect objects in *image* and summarise them.

    Args:
        image: A PIL image whose mode is ``'RGB'`` or ``'L'`` (grayscale).

    Returns:
        A dict with three keys:

        * ``"caption"`` -- human-readable summary such as
          ``"Detected objects: cat (0.98), sofa (0.71)"``, or
          ``"No objects detected."`` when the detector returns nothing.
        * ``"objects"`` -- list of ``{"label", "score"}`` dicts, one per
          distinct label, ordered by descending score; scores are rounded
          to two decimals.
        * ``"confidence"`` -- the highest score across all labels rounded
          to two decimals, or ``0.0`` when nothing was detected.

    Raises:
        ValueError: If *image* is not a PIL image or its mode is neither
            ``'RGB'`` nor ``'L'``.
    """
    # Validate input
    if not isinstance(image, Image.Image) or image.mode not in ('RGB', 'L'):
        raise ValueError("Input must be a valid PIL Image in RGB or grayscale format")

    # Run object detection
    results = detector(image)

    # Track highest score per object: the detector may report the same label
    # several times (one entry per detection), but the summary lists each
    # label once with its best score.
    objects_dict = {}
    for result in results:
        label = result['label']
        score = result['score']
        objects_dict[label] = max(objects_dict.get(label, score), score)

    # Build structured list of objects, most confident first
    ranked = sorted(objects_dict.items(), key=lambda item: item[1], reverse=True)
    objects_list = [
        {"label": label, "score": round(score, 2)}
        for label, score in ranked
    ]

    # Create readable caption
    detected_objects = [f"{obj['label']} ({obj['score']:.2f})" for obj in objects_list]
    if detected_objects:
        caption = "Detected objects: " + ", ".join(detected_objects)
    else:
        caption = "No objects detected."

    # Highest confidence score (0.0 when there were no detections)
    max_confidence = max(objects_dict.values(), default=0.0)

    return {
        "caption": caption,
        "objects": objects_list,
        "confidence": round(max_confidence, 2),
    }