Spaces:
Sleeping
Sleeping
File size: 1,507 Bytes
ebffcc9 ac03a61 23c9421 1fcf94f ebffcc9 1fcf94f 3df481e ac03a61 ebffcc9 ac03a61 3df481e ac03a61 3df481e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# Image captioning via object detection, using a Hugging Face pipeline.
from transformers import pipeline
from PIL import Image
# Load object detection model once at import time.
# NOTE: module-level side effect — instantiating the pipeline downloads/loads
# model weights, so importing this module is slow on first use.
MODEL_NAME = "facebook/detr-resnet-50"
detector = pipeline("object-detection", model=MODEL_NAME)
def caption_image(image: Image.Image) -> dict:
    """Detect objects in *image* and return a structured caption.

    Parameters
    ----------
    image : PIL.Image.Image
        Input image; must be in RGB or grayscale ('L') mode.

    Returns
    -------
    dict
        ``caption``: readable summary string;
        ``objects``: list of ``{"label", "score"}`` dicts, one per distinct
        label, sorted by descending score (scores rounded to 2 decimals);
        ``confidence``: highest detection score overall (0.0 if none).

    Raises
    ------
    ValueError
        If *image* is not a PIL Image or is in an unsupported mode.
    """
    # Validate input before handing it to the model.
    if not isinstance(image, Image.Image) or image.mode not in ('RGB', 'L'):
        raise ValueError("Input must be a valid PIL Image in RGB or grayscale format")

    # Run object detection; results is a list of {"label", "score", ...} dicts.
    results = detector(image)

    # Track highest score per object label.
    # Single-lookup dict.get/max instead of the `if label in dict` double lookup.
    best_scores = {}
    for result in results:
        label = result['label']
        score = result['score']
        best_scores[label] = max(best_scores.get(label, score), score)

    # Build structured list of objects, highest confidence first.
    objects_list = [
        {"label": label, "score": round(score, 2)}
        for label, score in sorted(best_scores.items(), key=lambda kv: kv[1], reverse=True)
    ]

    # Create readable caption. Parentheses make the conditional's scope
    # explicit rather than relying on ternary precedence.
    detected_objects = [f"{obj['label']} ({obj['score']:.2f})" for obj in objects_list]
    caption = (
        "Detected objects: " + ", ".join(detected_objects)
        if detected_objects
        else "No objects detected."
    )

    # Highest confidence score across all detections (0.0 when nothing found).
    max_confidence = max(best_scores.values()) if best_scores else 0.0

    return {
        "caption": caption,
        "objects": objects_list,
        "confidence": round(max_confidence, 2)
    }
|