File size: 1,507 Bytes
ebffcc9
ac03a61
23c9421
 
1fcf94f
ebffcc9
1fcf94f
3df481e
 
 
 
 
 
ac03a61
ebffcc9
ac03a61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3df481e
 
ac03a61
 
3df481e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
    # Image captioning via object detection (Hugging Face DETR pipeline).
from transformers import pipeline
from PIL import Image

# Load the DETR object-detection pipeline once at import time so every
# caption_image() call reuses the same model instance (loading is expensive).
MODEL_NAME = "facebook/detr-resnet-50"
detector = pipeline("object-detection", model=MODEL_NAME)

def caption_image(image: Image.Image) -> dict:
    """Detect objects in *image* and build a human-readable caption.

    Args:
        image: A PIL image in RGB or grayscale ('L') mode.

    Returns:
        dict with keys:
            "caption": summary string listing detected objects with their
                scores, or "No objects detected." when nothing was found.
            "objects": list of {"label": str, "score": float} dicts, one per
                distinct label, sorted by score descending (scores rounded
                to 2 decimals).
            "confidence": highest raw detection score, rounded to 2 decimals
                (0.0 when nothing was detected).

    Raises:
        ValueError: If *image* is not a PIL Image or not in RGB/'L' mode.
    """
    # Validate input before handing it to the pipeline.
    if not isinstance(image, Image.Image) or image.mode not in ('RGB', 'L'):
        raise ValueError("Input must be a valid PIL Image in RGB or grayscale format")

    # Run object detection; each result has at least 'label' and 'score'.
    results = detector(image)

    # Keep only the highest score seen for each distinct label.
    best_scores = {}
    for result in results:
        label = result['label']
        best_scores[label] = max(best_scores.get(label, 0.0), result['score'])

    # Structured list of objects, highest-confidence first.
    objects_list = [
        {"label": label, "score": round(score, 2)}
        for label, score in sorted(best_scores.items(), key=lambda kv: kv[1], reverse=True)
    ]

    # Readable caption; parentheses make the ternary grouping explicit
    # (the original relied on `+` binding tighter than the conditional).
    detected_objects = [f"{obj['label']} ({obj['score']:.2f})" for obj in objects_list]
    caption = ("Detected objects: " + ", ".join(detected_objects)) if detected_objects else "No objects detected."

    # Highest raw confidence score across all detections.
    max_confidence = max(best_scores.values()) if best_scores else 0.0

    return {
        "caption": caption,
        "objects": objects_list,
        "confidence": round(max_confidence, 2),
    }