Spaces:
Sleeping
Sleeping
| # Track highest score per object | |
| from transformers import pipeline | |
| from PIL import Image | |
# Checkpoint identifier handed to the object-detection pipeline below.
MODEL_NAME = "facebook/detr-resnet-50"

# Built once at import time; caption_image() reuses this single instance
# for every call instead of constructing a new pipeline per image.
detector = pipeline(task="object-detection", model=MODEL_NAME)
def caption_image(image: Image.Image):
    """Detect objects in *image* and summarise them as a caption.

    The image is passed to the module-level ``detector``. Each detection is
    read through its ``'label'`` and ``'score'`` entries; when a label occurs
    more than once, only its highest score is kept.

    Args:
        image: A PIL image whose mode is ``'RGB'`` or ``'L'``.

    Returns:
        A dict with three keys:
        ``"caption"`` - a readable sentence listing every label with its
        score, or ``"No objects detected."`` when nothing was found;
        ``"objects"`` - a list of ``{"label", "score"}`` dicts ordered from
        highest to lowest score, scores rounded to two decimals;
        ``"confidence"`` - the highest score rounded to two decimals, or
        ``0.0`` when nothing was found.

    Raises:
        ValueError: If *image* is not a PIL image or has any other mode.
    """
    is_supported = isinstance(image, Image.Image) and image.mode in ('RGB', 'L')
    if not is_supported:
        raise ValueError("Input must be a valid PIL Image in RGB or grayscale format")

    # Collapse duplicate labels, remembering only the best score for each.
    best_scores = {}
    for detection in detector(image):
        name = detection['label']
        value = detection['score']
        if name not in best_scores or value > best_scores[name]:
            best_scores[name] = value

    # Highest score first; the sort is stable, so equal scores keep the
    # order in which their labels were first seen.
    ranked = list(best_scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    # Build the structured entries and the caption fragments in one pass.
    objects_list = []
    fragments = []
    for name, value in ranked:
        rounded = round(value, 2)
        objects_list.append({"label": name, "score": rounded})
        fragments.append(f"{name} ({rounded:.2f})")

    if fragments:
        caption = "Detected objects: " + ", ".join(fragments)
    else:
        caption = "No objects detected."

    top_score = max(best_scores.values(), default=0.0)

    return {
        "caption": caption,
        "objects": objects_list,
        "confidence": round(top_score, 2),
    }